net/inet/tcp.c


DEFINITIONS

This source file includes the following definitions:
  1. min
  2. tcp_set_state
  3. tcp_select_window
  4. tcp_find_established
  5. tcp_close_pending
  6. tcp_dequeue_established
  7. tcp_time_wait
  8. tcp_do_retransmit
  9. tcp_retransmit_time
  10. tcp_retransmit
  11. tcp_err
  12. tcp_readable
  13. do_tcp_select
  14. tcp_select
  15. tcp_ioctl
  16. tcp_check
  17. tcp_send_check
  18. tcp_send_skb
  19. tcp_dequeue_partial
  20. tcp_send_partial
  21. tcp_enqueue_partial
  22. tcp_send_ack
  23. tcp_build_header
  24. tcp_write
  25. tcp_sendto
  26. tcp_read_wakeup
  27. cleanup_rbuf
  28. tcp_read_urg
  29. tcp_read
  30. tcp_shutdown
  31. tcp_recvfrom
  32. tcp_reset
  33. tcp_options
  34. default_mask
  35. tcp_init_seq
  36. tcp_conn_request
  37. tcp_close
  38. tcp_write_xmit
  39. tcp_ack
  40. tcp_fin
  41. tcp_data
  42. tcp_check_urg
  43. tcp_urg
  44. tcp_accept
  45. tcp_connect
  46. tcp_sequence
  47. tcp_std_reset
  48. tcp_rcv
  49. tcp_write_wakeup
  50. tcp_send_probe0
  51. tcp_setsockopt
  52. tcp_getsockopt

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@no.unit.nvg>
  20  *
  21  * Fixes:       
  22  *              Alan Cox        :       Numerous verify_area() calls
  23  *              Alan Cox        :       Set the ACK bit on a reset
  24  *              Alan Cox        :       Stopped it crashing if it closed while sk->inuse=1
  25  *                                      and was trying to connect (tcp_err()).
  26  *              Alan Cox        :       All icmp error handling was broken
  27  *                                      pointers passed where wrong and the
  28  *                                      socket was looked up backwards. Nobody
  29  *                                      tested any icmp error code obviously.
  30  *              Alan Cox        :       tcp_err() now handled properly. It wakes people
  31  *                                      on errors. select behaves and the icmp error race
  32  *                                      has gone by moving it into sock.c
  33  *              Alan Cox        :       tcp_reset() fixed to work for everything not just
  34  *                                      packets for unknown sockets.
  35  *              Alan Cox        :       tcp option processing.
  36  *              Alan Cox        :       Reset tweaked (still not 100%) [Had syn rule wrong]
  37  *              Herp Rosmanith  :       More reset fixes
  38  *              Alan Cox        :       No longer acks invalid rst frames. Acking
  39  *                                      any kind of RST is right out.
  40  *              Alan Cox        :       Sets an ignore me flag on an rst receive
  41  *                                      otherwise odd bits of prattle escape still
  42  *              Alan Cox        :       Fixed another acking RST frame bug. Should stop
  43  *                                      LAN workplace lockups.
  44  *              Alan Cox        :       Some tidyups using the new skb list facilities
  45  *              Alan Cox        :       sk->keepopen now seems to work
  46  *              Alan Cox        :       Pulls options out correctly on accepts
  47  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  48  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a bit to skb ops.
  49  *              Alan Cox        :       Tidied tcp_data to avoid a potential nasty.
  50  *              Alan Cox        :       Added some better commenting, as the tcp is hard to follow
  51  *              Alan Cox        :       Removed incorrect check for 20 * psh
  52  *      Michael O'Reilly        :       ack < copied bug fix.
  53  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  54  *              Alan Cox        :       FIN with no memory -> CRASH
  55  *              Alan Cox        :       Added socket option proto entries. Also added awareness of them to accept.
  56  *              Alan Cox        :       Added TCP options (SOL_TCP)
  57  *              Alan Cox        :       Switched wakeup calls to callbacks, so the kernel can layer network sockets.
  58  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  59  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  60  *              Alan Cox        :       RST frames sent on unsynchronised state ack error.
  61  *              Alan Cox        :       Put in missing check for SYN bit.
  62  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  63  *                                      window non shrink trick.
  64  *              Alan Cox        :       Added a couple of small NET2E timer fixes
  65  *              Charles Hedrick :       TCP fixes
  66  *              Toomas Tamm     :       TCP window fixes
  67  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  68  *              Charles Hedrick :       Rewrote most of it to actually work
  69  *              Linus           :       Rewrote tcp_read() and URG handling
  70  *                                      completely
  71  *              Gerhard Koerting:       Fixed some missing timer handling
  72  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  73  *              Gerhard Koerting:       PC/TCP workarounds
  74  *              Adam Caldwell   :       Assorted timer/timing errors
  75  *              Matthew Dillon  :       Fixed another RST bug
  76  *              Alan Cox        :       Move to kernel side addressing changes.
  77  *              Alan Cox        :       Beginning work on TCP fastpathing (not yet usable)
  78  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  79  *              Alan Cox        :       TCP fast path debugging
  80  *              Alan Cox        :       Window clamping
  81  *              Michael Riepe   :       Bug in tcp_check()
  82  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  83  *              Matt Dillon     :       Yet more small nasties removed from the TCP code
  84  *                                      (Be very nice to this man if tcp finally works 100%) 8)
  85  *              Alan Cox        :       BSD accept semantics. 
  86  *              Alan Cox        :       Reset on closedown bug.
  87  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
  88  *              Michael Pall    :       Handle select() after URG properly in all cases.
  89  *              Michael Pall    :       Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin).
  90  *              Michael Pall    :       Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now.
  91  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the BSD api.
  92  *              Alan Cox        :       Changed the semantics of sk->socket to 
  93  *                                      fix a race and a signal problem with
  94  *                                      accept() and async I/O.
  95  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
  96  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
  97  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
  98  *                                      clients/servers which listen in on
  99  *                                      fixed ports.
 100  *              Alan Cox        :       Cleaned the above up and shrank it to
 101  *                                      a sensible code size.
 102  *              Alan Cox        :       Self connect lockup fix.
 103  *              Alan Cox        :       No connect to multicast.
 104  *              Ross Biro       :       Close unaccepted children on master
 105  *                                      socket close.
 106  *              Alan Cox        :       Reset tracing code.
 107  *              Alan Cox        :       Spurious resets on shutdown.
 108  *              Alan Cox        :       Giant 15 minute/60 second timer error
 109  *              Alan Cox        :       Small whoops in selecting before an accept.
 110  *              Alan Cox        :       Kept the state trace facility since it's
 111  *                                      handy for debugging.
 112  *              Alan Cox        :       More reset handler fixes.
 113  *              Alan Cox        :       Started rewriting the code based on the RFCs;
 114  *                                      for other useful protocol references see:  
 115  *                                      Comer, KA9Q NOS, and for a reference on the
 116  *                                      difference between specifications and how BSD
 117  *                                      works see the 4.4lite source.
 118  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 119  *                                      close.
 120  *
 121  *
 122  * To Fix:
 123  *              Fast path the code. Two things here - fix the window calculation
 124  *              so it doesn't iterate over the queue, also spot packets with no funny
 125  *              options arriving in order and process directly.
 126  *
 127  *              Implement RFC 1191 [Path MTU discovery]
 128  *              Look at the effect of implementing RFC 1337 suggestions and their impact.
 129  *              Rewrite output state machine to use a single queue and do low window
 130  *              situations as per the spec (RFC 1122)
 131  *              Speed up input assembly algorithm.
 132  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 133  *              could do with it working on IPv4
 134  *              User settable/learned rtt/max window/mtu
 135  *              Cope with MTU/device switches when retransmitting in tcp.
 136  *
 137  *
 138  *
 139  *              This program is free software; you can redistribute it and/or
 140  *              modify it under the terms of the GNU General Public License
 141  *              as published by the Free Software Foundation; either version
 142  *              2 of the License, or (at your option) any later version.
 143  *
 144  * Description of States:
 145  *
 146  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 147  *
 148  *      TCP_SYN_RECV            received a connection request, sent ack,
 149  *                              waiting for final ack in three-way handshake.
 150  *
 151  *      TCP_ESTABLISHED         connection established
 152  *
 153  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 154  *                              transmission of remaining buffered data
 155  *
 156  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 157  *                              to shutdown
 158  *
 159  *      TCP_CLOSING             both sides have shutdown but we still have
 160  *                              data we have to finish sending
 161  *
 162  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 163  *                              closed, can only be entered from FIN_WAIT2
 164  *                              or CLOSING.  Required because the other end
 165  *                              may not have gotten our last ACK causing it
 166  *                              to retransmit the data packet (which we ignore)
 167  *
 168  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 169  *                              us to finish writing our data and to shutdown
 170  *                              (we have to close() to move on to LAST_ACK)
 171  *
 172  *      TCP_LAST_ACK            our side has shutdown after remote has
 173  *                              shutdown.  There may still be data in our
 174  *                              buffer that we have to finish sending
 175  *              
 176  *      TCP_CLOSE               socket is finished
 177  */
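/*
 * For orientation only (standard TCP behaviour, summarised from the state
 * descriptions above): an active close normally walks ESTABLISHED ->
 * FIN_WAIT1 -> FIN_WAIT2 -> TIME_WAIT -> CLOSE, while the passive side
 * walks ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE.
 */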
 178 #include <linux/types.h>
 179 #include <linux/sched.h>
 180 #include <linux/mm.h>
 181 #include <linux/string.h>
 182 #include <linux/socket.h>
 183 #include <linux/sockios.h>
 184 #include <linux/termios.h>
 185 #include <linux/in.h>
 186 #include <linux/fcntl.h>
 187 #include <linux/inet.h>
 188 #include <linux/netdevice.h>
 189 #include "snmp.h"
 190 #include "ip.h"
 191 #include "protocol.h"
 192 #include "icmp.h"
 193 #include "tcp.h"
 194 #include <linux/skbuff.h>
 195 #include "sock.h"
 196 #include "route.h"
 197 #include <linux/errno.h>
 198 #include <linux/timer.h>
 199 #include <asm/system.h>
 200 #include <asm/segment.h>
 201 #include <linux/mm.h>
 202 
 203 #undef TCP_FASTPATH
 204 
 205 #define SEQ_TICK 3
 206 unsigned long seq_offset;
 207 struct tcp_mib  tcp_statistics;
 208 
 209 static void tcp_close(struct sock *sk, int timeout);
 210 
 211 #ifdef TCP_FASTPATH
 212 unsigned long tcp_rx_miss=0, tcp_rx_hit1=0, tcp_rx_hit2=0;
 213 #endif
 214 
 215 /* The less said about this the better, but it works and will do for 1.2 */
 216 
 217 static struct wait_queue *master_select_wakeup;
 218 
 219 static __inline__ int min(unsigned int a, unsigned int b)
 220 {
 221         if (a < b) 
 222                 return(a);
 223         return(b);
 224 }
 225 
 226 #undef STATE_TRACE
 227 
 228 #ifdef STATE_TRACE
 229 static char *statename[]={
 230         "Unused","Established","Syn Sent","Syn Recv",
 231         "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
 232         "Close Wait","Last ACK","Listen","Closing"
 233 };
 234 #endif
 235 
 236 static __inline__ void tcp_set_state(struct sock *sk, int state)
 237 {
 238         if(sk->state==TCP_ESTABLISHED)
 239                 tcp_statistics.TcpCurrEstab--;
 240 #ifdef STATE_TRACE
 241         if(sk->debug)
 242                 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
 243 #endif  
 244         /* This is a hack but it doesn't occur often and it's going to
 245            be a real        to fix nicely */
 246            
 247         if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
 248         {
 249                 wake_up_interruptible(&master_select_wakeup);
 250         }
 251         sk->state=state;
 252         if(state==TCP_ESTABLISHED)
 253                 tcp_statistics.TcpCurrEstab++;
 254 }
 255 
 256 /* This routine picks a TCP window for a socket based on
 257    the following constraints
 258    
 259    1. The window can never be shrunk once it is offered (RFC 793)
 260    2. We limit memory per socket
 261    
 262    For now we use NET2E3's heuristic of offering half the memory
 263    we have handy. All is not as bad as this seems however because
 264    of two things. Firstly we will bin packets even within the window
 265    in order to get the data we are waiting for into the memory limit.
 266    Secondly we bin common duplicate forms at receive time
 267    
 268    Better heuristics welcome
 269 */
 270    
 271 int tcp_select_window(struct sock *sk)
 272 {
 273         int new_window = sk->prot->rspace(sk);
 274         
 275         if(sk->window_clamp)
 276                 new_window=min(sk->window_clamp,new_window);
 277 /*
 278  * two things are going on here.  First, we don't ever offer a
 279  * window less than min(sk->mss, MAX_WINDOW/2).  This is the
 280  * receiver side of SWS as specified in RFC1122.
 281  * Second, we always give them at least the window they
 282  * had before, in order to avoid retracting window.  This
 283  * is technically allowed, but RFC1122 advises against it and
 284  * in practice it causes trouble.
 285  */
 286         if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
 287                 return(sk->window);
 288         return(new_window);
 289 }
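/*
 * Worked example of the rule above (numbers purely illustrative): with
 * sk->mss = 1460 and MAX_WINDOW/2 well above that, a standing offer of
 * sk->window = 4096 and rspace() now returning 3000 would shrink the
 * window, so tcp_select_window() keeps answering 4096 until the free
 * receive space again exceeds the earlier offer.
 */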
 290 
 291 /*
 292  *      Find someone to 'accept'. Must be called with
 293  *      sk->inuse=1 or cli()
 294  */ 
 295 
 296 static struct sk_buff *tcp_find_established(struct sock *s)
 297 {
 298         struct sk_buff *p=skb_peek(&s->receive_queue);
 299         if(p==NULL)
 300                 return NULL;
 301         do
 302         {
 303                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
 304                         return p;
 305                 p=p->next;
 306         }
 307         while(p!=(struct sk_buff *)&s->receive_queue);
 308         return NULL;
 309 }
 310 
 311 
 312 /* 
 313  *      This routine closes sockets which have been at least partially
 314  *      opened, but not yet accepted. Currently it is only called by
 315  *      tcp_close, and timeout mirrors the value there. 
 316  */
 317 
 318 static void tcp_close_pending (struct sock *sk, int timeout) 
 319 {
 320         struct sk_buff *skb;
 321 
 322         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) {
 323                 tcp_close(skb->sk, timeout);
 324                 kfree_skb(skb, FREE_READ);
 325         }
 326         return;
 327 }
 328 
 329 static struct sk_buff *tcp_dequeue_established(struct sock *s)
 330 {
 331         struct sk_buff *skb;
 332         unsigned long flags;
 333         save_flags(flags);
 334         cli(); 
 335         skb=tcp_find_established(s);
 336         if(skb!=NULL)
 337                 skb_unlink(skb);        /* Take it off the queue */
 338         restore_flags(flags);
 339         return skb;
 340 }
 341 
 342 
 343 /*
 344  *      Enter the time wait state. 
 345  */
 346 
 347 static void tcp_time_wait(struct sock *sk)
 348 {
 349         tcp_set_state(sk,TCP_TIME_WAIT);
 350         sk->shutdown = SHUTDOWN_MASK;
 351         if (!sk->dead)
 352                 sk->state_change(sk);
 353         reset_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 354 }
 355 
 356 /*
 357  *      A socket has timed out on its send queue and wants to do a
 358  *      little retransmitting. Currently this means TCP.
 359  */
 360 
 361 void tcp_do_retransmit(struct sock *sk, int all)
 362 {
 363         struct sk_buff * skb;
 364         struct proto *prot;
 365         struct device *dev;
 366 
 367         prot = sk->prot;
 368         skb = sk->send_head;
 369 
 370         while (skb != NULL)
 371         {
 372                 struct tcphdr *th;
 373                 struct iphdr *iph;
 374                 int size;
 375 
 376                 dev = skb->dev;
 377                 IS_SKB(skb);
 378                 skb->when = jiffies;
 379 
 380                 /*
 381                  * In general it's OK just to use the old packet.  However we
 382                  * need to use the current ack and window fields.  Urg and
 383                  * urg_ptr could possibly stand to be updated as well, but we
 384                  * don't keep the necessary data.  That shouldn't be a problem,
 385                  * if the other end is doing the right thing.  Since we're
 386                  * changing the packet, we have to issue a new IP identifier.
 387                  */
 388 
 389 
 390                 iph = (struct iphdr *)(skb->data + dev->hard_header_len);
 391                 th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
 392                 size = skb->len - (((unsigned char *) th) - skb->data);
 393 
 394                 iph->id = htons(ip_id_count++);
 395                 ip_send_check(iph);
 396 
 397                 /*
 398                  *      This is not the right way to handle this. We have to
 399                  *      issue an up to date window and ack report with this 
 400                  *      retransmit to keep the odd buggy tcp that relies on 
 401                  *      the fact BSD does this happy. 
 402                  *      We don't however need to recalculate the entire 
 403                  *      checksum, so someone wanting a small problem to play
 404                  *      with might like to implement RFC1141/RFC1624 and speed
 405                  *      this up by avoiding a full checksum.
 406                  */
 407                  
 408                 th->ack_seq = ntohl(sk->acked_seq);
 409                 th->window = ntohs(tcp_select_window(sk));
 410                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
 411                 
 412                 /*
 413                  *      If the interface is (still) up and running, kick it.
 414                  */
 415 
 416                 if (dev->flags & IFF_UP)
 417                 {
 418                         /*
 419                          *      If the packet is still being sent by the device/protocol
 420                          *      below then don't retransmit. This is both needed, and good -
 421                          *      especially with connected mode AX.25 where it stops resends
 422                          *      of a frame that has not even been sent yet!
 423                          *      We still add up the counts as the round trip time wants
 424                          *      adjusting.
 425                          */
 426                         if (sk && !skb_device_locked(skb))
 427                         {
 428                                 /* Remove it from any existing driver queue first! */
 429                                 skb_unlink(skb);
 430                                 /* Now queue it */
 431                                 ip_statistics.IpOutRequests++;
 432                                 dev_queue_xmit(skb, dev, sk->priority);
 433                         }
 434                 }
 435 
 436                 /*
 437                  *      Count retransmissions
 438                  */
 439                 sk->retransmits++;
 440                 sk->prot->retransmits ++;
 441 
 442                 /*
 443                  *      Only one retransmit requested.
 444                  */
 445                 if (!all)
 446                         break;
 447 
 448                 /*
 449                  *      This should cut it off before we send too many packets.
 450                  */
 451                 if (sk->retransmits >= sk->cong_window)
 452                         break;
 453                 skb = skb->link3;
 454         }
 455 }
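/*
 *      The comment in tcp_do_retransmit() above suggests avoiding the full
 *      checksum recalculation.  The helper below is only a sketch of the
 *      RFC 1141/RFC 1624 style incremental update it alludes to; nothing
 *      in this file uses it, and the name is made up for the sketch.
 *      "old_check" is the 16 bit checksum currently in the header,
 *      "old_word"/"new_word" the 16 bit field being rewritten (e.g. the
 *      window), all taken in the same byte order.
 */

static unsigned short tcp_csum_incremental(unsigned short old_check,
        unsigned short old_word, unsigned short new_word)
{
        /* HC' = ~(~HC + ~m + m')   (RFC 1624, eqn. 3) */
        unsigned long sum = (unsigned short)~old_check;

        sum += (unsigned short)~old_word;
        sum += new_word;

        /* fold the carries back into 16 bits */
        sum = (sum & 0xffff) + (sum >> 16);
        sum = (sum & 0xffff) + (sum >> 16);
        return (unsigned short)~sum;
}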
 456 
 457 /*
 458  *      This is the normal code called for timeouts.  It does the retransmission
 459  *      and then does backoff.  tcp_do_retransmit is separated out because
 460  *      tcp_ack needs to send stuff from the retransmit queue without
 461  *      initiating a backoff.
 462  */
 463 
 464 void tcp_retransmit_time(struct sock *sk, int all)
 465 {
 466         tcp_do_retransmit(sk, all);
 467 
 468         /*
 469          * Increase the timeout each time we retransmit.  Note that
 470          * we do not increase the rtt estimate.  rto is initialized
 471          * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
 472          * that doubling rto each time is the least we can get away with.
 473          * In KA9Q, Karn uses this for the first few times, and then
 474          * goes to quadratic.  netBSD doubles, but only goes up to *64,
 475          * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
 476          * defined in the protocol as the maximum possible RTT.  I guess
 477          * we'll have to use something other than TCP to talk to the
 478          * University of Mars.
 479          */
 480 
 481         sk->retransmits++;
 482         sk->backoff++;
 483         sk->rto = min(sk->rto << 1, 120*HZ);
 484         reset_timer(sk, TIME_WRITE, sk->rto);
 485 }
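/*
 *      Rough illustration of the clamped doubling above, assuming a
 *      starting rto of 3*HZ (the real value comes from the rtt
 *      estimator): successive timeouts give 6*HZ, 12*HZ, 24*HZ, 48*HZ,
 *      96*HZ and then 120*HZ, where the timeout stays clamped.
 */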
 486 
 487 
 488 /*
 489  *      A timer event has triggered a tcp retransmit timeout. The
 490  *      socket xmit queue is ready and set up to send. Because
 491  *      the ack receive code keeps the queue straight we do
 492  *      nothing clever here.
 493  */
 494 
 495 static void tcp_retransmit(struct sock *sk, int all)
 496 {
 497         if (all) 
 498         {
 499                 tcp_retransmit_time(sk, all);
 500                 return;
 501         }
 502 
 503         sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
 504         /* sk->ssthresh in theory can be zero.  I guess that's OK */
 505         sk->cong_count = 0;
 506 
 507         sk->cong_window = 1;
 508 
 509         /* Do the actual retransmit. */
 510         tcp_retransmit_time(sk, all);
 511 }
 512 
 513 
 514 /*
 515  * This routine is called by the ICMP module when it gets some
 516  * sort of error condition.  If err < 0 then the socket should
 517  * be closed and the error returned to the user.  If err > 0
 518  * it's just the icmp type << 8 | icmp code.  After adjustment
 519  * header points to the first 8 bytes of the tcp header.  We need
 520  * to find the appropriate port.
 521  */
 522 
 523 void tcp_err(int err, unsigned char *header, unsigned long daddr,
 524         unsigned long saddr, struct inet_protocol *protocol)
 525 {
 526         struct tcphdr *th;
 527         struct sock *sk;
 528         struct iphdr *iph=(struct iphdr *)header;
 529   
 530         header+=4*iph->ihl;
 531    
 532 
 533         th =(struct tcphdr *)header;
 534         sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
 535 
 536         if (sk == NULL) 
 537                 return;
 538   
 539         if(err<0)
 540         {
 541                 sk->err = -err;
 542                 sk->error_report(sk);
 543                 return;
 544         }
 545 
 546         if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8)) 
 547         {
 548                 /*
 549                  * FIXME:
 550                  * For now we will just trigger a linear backoff.
 551                  * The slow start code should cause a real backoff here.
 552                  */
 553                 if (sk->cong_window > 4)
 554                         sk->cong_window--;
 555                 return;
 556         }
 557 
 558 /*      sk->err = icmp_err_convert[err & 0xff].errno;  -- moved as TCP should hide non fatals internally (and does) */
 559 
 560         /*
 561          * If we've already connected we will keep trying
 562          * until we time out, or the user gives up.
 563          */
 564 
 565         if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT) 
 566         {
 567                 if (sk->state == TCP_SYN_SENT) 
 568                 {
 569                         tcp_statistics.TcpAttemptFails++;
 570                         tcp_set_state(sk,TCP_CLOSE);
 571                         sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
 572                 }
 573                 sk->err = icmp_err_convert[err & 0xff].errno;           
 574         }
 575         return;
 576 }
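/*
 *      Purely illustrative (not used above): how the composite "err"
 *      value handled by tcp_err() splits back into the original ICMP
 *      fields when err > 0.  The helper name is made up for this sketch.
 */

static __inline__ void tcp_err_split(int err, int *icmp_type, int *icmp_code)
{
        *icmp_type = (err >> 8) & 0xff;
        *icmp_code = err & 0xff;
}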
 577 
 578 
 579 /*
 580  *      Walk down the receive queue counting readable data until we hit the end or we find a gap
 581  *      in the received data queue (ie a frame missing that needs sending to us)
 582  */
 583 
 584 static int tcp_readable(struct sock *sk)
 585 {
 586         unsigned long counted;
 587         unsigned long amount;
 588         struct sk_buff *skb;
 589         int sum;
 590         unsigned long flags;
 591 
 592         if(sk && sk->debug)
 593                 printk("tcp_readable: %p - ",sk);
 594 
 595         save_flags(flags);
 596         cli();
 597         if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
 598         {
 599                 restore_flags(flags);
 600                 if(sk && sk->debug) 
 601                         printk("empty\n");
 602                 return(0);
 603         }
 604   
 605         counted = sk->copied_seq;       /* Where we are at the moment */
 606         amount = 0;
 607   
 608         /* Do until a push or until we are out of data. */
 609         do 
 610         {
 611                 if (before(counted, skb->h.th->seq))    /* Found a hole so stops here */
 612                         break;
 613                 sum = skb->len -(counted - skb->h.th->seq);     /* Length - header but start from where we are up to (avoid overlaps) */
 614                 if (skb->h.th->syn)
 615                         sum++;
 616                 if (sum > 0) 
 617                 {                                       /* Add it up, move on */
 618                         amount += sum;
 619                         if (skb->h.th->syn) 
 620                                 amount--;
 621                         counted += sum;
 622                 }
 623                 /*
 624                  * Don't count urg data ... but do it in the right place!
 625                  * Consider: "old_data (ptr is here) URG PUSH data"
 626                  * The old code would stop at the first push because
 627                  * it counted the urg (amount==1) and then does amount--
 628                  * *after* the loop.  This means tcp_readable() always
 629                  * returned zero if any URG PUSH was in the queue, even
 630                  * though there was normal data available. If we subtract
 631                  * the urg data right here, we even get it to work for more
 632                  * than one URG PUSH skb without normal data.
 633                  * This means that select() finally works now with urg data
 634                  * in the queue.  Note that rlogin was never affected
 635                  * because it doesn't use select(); it uses two processes
 636                  * and a blocking read().  And the queue scan in tcp_read()
 637                  * was correct.  Mike <pall@rz.uni-karlsruhe.de>
 638                  */
 639                 if (skb->h.th->urg)
 640                         amount--;       /* don't count urg data */
 641                 if (amount && skb->h.th->psh) break;
 642                 skb = skb->next;
 643         }
 644         while(skb != (struct sk_buff *)&sk->receive_queue);
 645 
 646         restore_flags(flags);
 647         if(sk->debug)
 648                 printk("got %lu bytes.\n",amount);
 649         return(amount);
 650 }
 651 
 652 
 653 /*
 654  *      Wait for a TCP event. Note the oddity with SEL_IN and reading. The
 655  *      listening socket has a receive queue of sockets to accept.
 656  */
 657 
 658 static int do_tcp_select(struct sock *sk, int sel_type, select_table *wait)
 659 {
 660         switch(sel_type) 
 661         {
 662                 case SEL_IN:
 663                         if (sk->err)
 664                                 return 1;
 665                         if (sk->state == TCP_LISTEN) {
 666                                 select_wait(&master_select_wakeup,wait);
 667                                 return (tcp_find_established(sk) != NULL);
 668                         }
 669                         if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
 670                                 return 0;
 671                         if (sk->acked_seq != sk->copied_seq)
 672                                 return 1;
 673                         if (sk->shutdown & RCV_SHUTDOWN)
 674                                 return 1;
 675                         return 0;
 676 
 677                 case SEL_OUT:
 678                         if (sk->shutdown & SEND_SHUTDOWN) {
 679                                 /* FIXME: should this return an error? */
 680                                 return 0;
 681                         }
 682 
 683                         /*
 684                          * This is now right thanks to a small fix
 685                          * by Matt Dillon.
 686                          */
 687                         
 688                         if (sk->prot->wspace(sk) >= sk->mtu+128+sk->prot->max_header) 
 689                         {
 690                                 /* This should cause connect to work ok. */
 691                                 if (sk->state == TCP_SYN_RECV ||
 692                                     sk->state == TCP_SYN_SENT) return 0;
 693                                 return 1;
 694                         }
 695                         return 0;
 696 
 697                 case SEL_EX:
 698                         if (sk->err || sk->urg_data)
 699                                 return 1;
 700                         return 0;
 701         }
 702         return 0;
 703 }
 704 
 705 static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
 706 {
 707         int retval;
 708 
 709         sk->inuse = 1;
 710         select_wait(sk->sleep, wait);
 711         retval = do_tcp_select(sk, sel_type, wait);
 712         release_sock(sk);
 713         return retval;
 714 }
 715 
 716 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 717 {
 718         int err;
 719         switch(cmd) 
 720         {
 721 
 722                 case TIOCINQ:
 723 #ifdef FIXME    /* FIXME: */
 724                 case FIONREAD:
 725 #endif
 726                 {
 727                         unsigned long amount;
 728 
 729                         if (sk->state == TCP_LISTEN) 
 730                                 return(-EINVAL);
 731 
 732                         sk->inuse = 1;
 733                         amount = tcp_readable(sk);
 734                         release_sock(sk);
 735                         err=verify_area(VERIFY_WRITE,(void *)arg,
 736                                                    sizeof(unsigned long));
 737                         if(err)
 738                                 return err;
 739                         put_fs_long(amount,(unsigned long *)arg);
 740                         return(0);
 741                 }
 742                 case SIOCATMARK:
 743                 {
 744                         int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
 745 
 746                         err = verify_area(VERIFY_WRITE,(void *) arg,
 747                                                   sizeof(unsigned long));
 748                         if (err)
 749                                 return err;
 750                         put_fs_long(answ,(int *) arg);
 751                         return(0);
 752                 }
 753                 case TIOCOUTQ:
 754                 {
 755                         unsigned long amount;
 756 
 757                         if (sk->state == TCP_LISTEN) return(-EINVAL);
 758                         amount = sk->prot->wspace(sk);
 759                         err=verify_area(VERIFY_WRITE,(void *)arg,
 760                                                    sizeof(unsigned long));
 761                         if(err)
 762                                 return err;
 763                         put_fs_long(amount,(unsigned long *)arg);
 764                         return(0);
 765                 }
 766                 default:
 767                         return(-EINVAL);
 768         }
 769 }
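/*
 *      Illustrative user level use of the ioctls handled above (assumes
 *      a connected TCP socket "fd"; not part of this file):
 *
 *              unsigned long queued;
 *
 *              if (ioctl(fd, TIOCINQ, &queued) == 0)
 *                      printf("%lu bytes waiting to be read\n", queued);
 *              if (ioctl(fd, TIOCOUTQ, &queued) == 0)
 *                      printf("%lu bytes of send buffer free\n", queued);
 *
 *      Note that in this implementation TIOCOUTQ reports free transmit
 *      buffer space (sk->prot->wspace), not unsent bytes.
 */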
 770 
 771 
 772 /*
 773  *      This routine computes a TCP checksum. 
 774  */
 775  
 776 unsigned short tcp_check(struct tcphdr *th, int len,
 777           unsigned long saddr, unsigned long daddr)
 778 {     
 779         unsigned long sum;
 780    
 781         if (saddr == 0) saddr = ip_my_addr();
 782 
 783 /*
 784  * stupid, gcc complains when I use just one __asm__ block,
 785  * something about too many reloads, but this is just two
 786  * instructions longer than what I want
 787  */
 788         __asm__("
 789             addl %%ecx, %%ebx
 790             adcl %%edx, %%ebx
 791             adcl $0, %%ebx
 792             "
 793         : "=b"(sum)
 794         : "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
 795         : "bx", "cx", "dx" );
 796         __asm__("
 797             movl %%ecx, %%edx
 798             cld
 799             cmpl $32, %%ecx
 800             jb 2f
 801             shrl $5, %%ecx
 802             clc
 803 1:          lodsl
 804             adcl %%eax, %%ebx
 805             lodsl
 806             adcl %%eax, %%ebx
 807             lodsl
 808             adcl %%eax, %%ebx
 809             lodsl
 810             adcl %%eax, %%ebx
 811             lodsl
 812             adcl %%eax, %%ebx
 813             lodsl
 814             adcl %%eax, %%ebx
 815             lodsl
 816             adcl %%eax, %%ebx
 817             lodsl
 818             adcl %%eax, %%ebx
 819             loop 1b
 820             adcl $0, %%ebx
 821             movl %%edx, %%ecx
 822 2:          andl $28, %%ecx
 823             je 4f
 824             shrl $2, %%ecx
 825             clc
 826 3:          lodsl
 827             adcl %%eax, %%ebx
 828             loop 3b
 829             adcl $0, %%ebx
 830 4:          movl $0, %%eax
 831             testw $2, %%dx
 832             je 5f
 833             lodsw
 834             addl %%eax, %%ebx
 835             adcl $0, %%ebx
 836             movw $0, %%ax
 837 5:          test $1, %%edx
 838             je 6f
 839             lodsb
 840             addl %%eax, %%ebx
 841             adcl $0, %%ebx
 842 6:          movl %%ebx, %%eax
 843             shrl $16, %%eax
 844             addw %%ax, %%bx
 845             adcw $0, %%bx
 846             "
 847         : "=b"(sum)
 848         : "0"(sum), "c"(len), "S"(th)
 849         : "ax", "bx", "cx", "dx", "si" );
 850 
 851         /* We only want the bottom 16 bits, but we never cleared the top 16. */
 852   
 853         return((~sum) & 0xffff);
 854 }
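/*
 *      A portable (and much slower) sketch of what the assembler above
 *      computes: the 16 bit one's complement checksum over the TCP
 *      pseudo header (source, destination, protocol, length) followed
 *      by the TCP header and data.  Purely illustrative: nothing calls
 *      it, it assumes 32 bit longs as on the i386 this code targets,
 *      and unlike tcp_check() it does not substitute ip_my_addr() for
 *      a zero source address.  Addresses are taken as they sit in
 *      memory, i.e. already in network byte order.
 */

static unsigned short tcp_check_slow(struct tcphdr *th, int len,
          unsigned long saddr, unsigned long daddr)
{
        unsigned long sum = 0;
        unsigned char *ptr;
        int i;

        /* pseudo header: addresses, zero pad + protocol, tcp length */
        ptr = (unsigned char *) &saddr;
        for (i = 0; i < 4; i += 2)
                sum += (ptr[i] << 8) | ptr[i + 1];
        ptr = (unsigned char *) &daddr;
        for (i = 0; i < 4; i += 2)
                sum += (ptr[i] << 8) | ptr[i + 1];
        sum += IPPROTO_TCP;
        sum += len;

        /* tcp header and data, as big endian 16 bit words */
        ptr = (unsigned char *) th;
        for (i = 0; i + 1 < len; i += 2)
                sum += (ptr[i] << 8) | ptr[i + 1];
        if (len & 1)
                sum += ptr[len - 1] << 8;       /* pad the odd byte */

        /* fold the carries and complement */
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);

        /* return in network byte order, as tcp_send_check() stores it */
        return htons((unsigned short) ~sum);
}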
 855 
 856 
 857 
 858 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
 859                 unsigned long daddr, int len, struct sock *sk)
 860 {
 861         th->check = 0;
 862         th->check = tcp_check(th, len, saddr, daddr);
 863         return;
 864 }
 865 
 866 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
 867 {
 868         int size;
 869         struct tcphdr * th = skb->h.th;
 870 
 871         /* length of packet (not counting length of pre-tcp headers) */
 872         size = skb->len - ((unsigned char *) th - skb->data);
 873 
 874         /* sanity check it.. */
 875         if (size < sizeof(struct tcphdr) || size > skb->len) 
 876         {
 877                 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
 878                         skb, skb->data, th, skb->len);
 879                 kfree_skb(skb, FREE_WRITE);
 880                 return;
 881         }
 882 
 883         /* If we have queued a header size packet.. */
 884         if (size == sizeof(struct tcphdr)) 
 885         {
 886                 /* If it's got a syn or fin it's notionally included in the size. */
 887                 if(!th->syn && !th->fin) 
 888                 {
 889                         printk("tcp_send_skb: attempt to queue a bogon.\n");
 890                         kfree_skb(skb,FREE_WRITE);
 891                         return;
 892                 }
 893         }
 894 
 895         tcp_statistics.TcpOutSegs++;  
 896 
 897         skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
 898         if (after(skb->h.seq, sk->window_seq) ||
 899             (sk->retransmits && sk->timeout == TIME_WRITE) ||
 900              sk->packets_out >= sk->cong_window) 
 901         {
 902                 /* checksum will be supplied by tcp_write_xmit.  So
 903                  * we shouldn't need to set it at all.  I'm being paranoid */
 904                 th->check = 0;
 905                 if (skb->next != NULL) 
 906                 {
 907                         printk("tcp_send_skb: next != NULL\n");
 908                         skb_unlink(skb);
 909                 }
 910                 skb_queue_tail(&sk->write_queue, skb);
 911                 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
 912                     sk->send_head == NULL &&
 913                     sk->ack_backlog == 0)
 914                         reset_timer(sk, TIME_PROBE0, sk->rto);
 915         } 
 916         else 
 917         {
 918                 th->ack_seq = ntohl(sk->acked_seq);
 919                 th->window = ntohs(tcp_select_window(sk));
 920 
 921                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
 922 
 923                 sk->sent_seq = sk->write_seq;
 924                 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
 925         }
 926 }
 927 
 928 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
 929 {
 930         struct sk_buff * skb;
 931         unsigned long flags;
 932 
 933         save_flags(flags);
 934         cli();
 935         skb = sk->partial;
 936         if (skb) {
 937                 sk->partial = NULL;
 938                 del_timer(&sk->partial_timer);
 939         }
 940         restore_flags(flags);
 941         return skb;
 942 }
 943 
 944 static void tcp_send_partial(struct sock *sk)
 945 {
 946         struct sk_buff *skb;
 947 
 948         if (sk == NULL)
 949                 return;
 950         while ((skb = tcp_dequeue_partial(sk)) != NULL)
 951                 tcp_send_skb(sk, skb);
 952 }
 953 
 954 void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
 955 {
 956         struct sk_buff * tmp;
 957         unsigned long flags;
 958 
 959         save_flags(flags);
 960         cli();
 961         tmp = sk->partial;
 962         if (tmp)
 963                 del_timer(&sk->partial_timer);
 964         sk->partial = skb;
 965         init_timer(&sk->partial_timer);
 966         sk->partial_timer.expires = HZ;
 967         sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
 968         sk->partial_timer.data = (unsigned long) sk;
 969         add_timer(&sk->partial_timer);
 970         restore_flags(flags);
 971         if (tmp)
 972                 tcp_send_skb(sk, tmp);
 973 }
 974 
 975 
 976 /*
 977  *      This routine sends an ack and also updates the window. 
 978  */
 979  
 980 static void tcp_send_ack(unsigned long sequence, unsigned long ack,
 981              struct sock *sk,
 982              struct tcphdr *th, unsigned long daddr)
 983 {
 984         struct sk_buff *buff;
 985         struct tcphdr *t1;
 986         struct device *dev = NULL;
 987         int tmp;
 988 
 989         if(sk->zapped)
 990                 return;         /* We have been reset, we may not send again */
 991         /*
 992          * We need to grab some memory, and put together an ack,
 993          * and then put it into the queue to be sent.
 994          */
 995 
 996         buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
 997         if (buff == NULL) 
 998         {
 999                 /* Force it to send an ack. */
1000                 sk->ack_backlog++;
1001                 if (sk->timeout != TIME_WRITE && tcp_connected(sk->state)) 
1002                 {
1003                         reset_timer(sk, TIME_WRITE, 10);
1004                 }
1005                 return;
1006         }
1007 
1008         buff->len = sizeof(struct tcphdr);
1009         buff->sk = sk;
1010         buff->localroute = sk->localroute;
1011         t1 =(struct tcphdr *) buff->data;
1012 
1013         /* Put in the IP header and routing stuff. */
1014         tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1015                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1016         if (tmp < 0) 
1017         {
1018                 buff->free=1;
1019                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
1020                 return;
1021         }
1022         buff->len += tmp;
1023         t1 =(struct tcphdr *)((char *)t1 +tmp);
1024 
1025         /* FIXME: */
1026         memcpy(t1, th, sizeof(*t1)); /* this should probably be removed */
1027 
1028         /*
1029          *      Swap the send and the receive. 
1030          */
1031          
1032         t1->dest = th->source;
1033         t1->source = th->dest;
1034         t1->seq = ntohl(sequence);
1035         t1->ack = 1;
1036         sk->window = tcp_select_window(sk);
1037         t1->window = ntohs(sk->window);
1038         t1->res1 = 0;
1039         t1->res2 = 0;
1040         t1->rst = 0;
1041         t1->urg = 0;
1042         t1->syn = 0;
1043         t1->psh = 0;
1044         t1->fin = 0;
1045         if (ack == sk->acked_seq) 
1046         {
1047                 sk->ack_backlog = 0;
1048                 sk->bytes_rcv = 0;
1049                 sk->ack_timed = 0;
1050                 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
1051                                   && sk->timeout == TIME_WRITE) 
1052                 {
1053                         if(sk->keepopen) {
1054                                 reset_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1055                         } else {
1056                                 delete_timer(sk);
1057                         }
1058                 }
1059         }
1060         t1->ack_seq = ntohl(ack);
1061         t1->doff = sizeof(*t1)/4;
1062         tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1063         if (sk->debug)
1064                  printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
1065         tcp_statistics.TcpOutSegs++;
1066         sk->prot->queue_xmit(sk, dev, buff, 1);
1067 }
1068 
1069 
1070 /* 
1071  *      This routine builds a generic TCP header. 
1072  */
1073  
1074 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
1075 {
1076 
1077         /* FIXME: want to get rid of this. */
1078         memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1079         th->seq = htonl(sk->write_seq);
1080         th->psh =(push == 0) ? 1 : 0;
1081         th->doff = sizeof(*th)/4;
1082         th->ack = 1;
1083         th->fin = 0;
1084         sk->ack_backlog = 0;
1085         sk->bytes_rcv = 0;
1086         sk->ack_timed = 0;
1087         th->ack_seq = htonl(sk->acked_seq);
1088         sk->window = tcp_select_window(sk);
1089         th->window = htons(sk->window);
1090 
1091         return(sizeof(*th));
1092 }
1093 
1094 /*
1095  *      This routine copies from a user buffer into a socket,
1096  *      and starts the transmit system.
1097  */
1098 
1099 static int tcp_write(struct sock *sk, unsigned char *from,
1100           int len, int nonblock, unsigned flags)
1101 {
1102         int copied = 0;
1103         int copy;
1104         int tmp;
1105         struct sk_buff *skb;
1106         struct sk_buff *send_tmp;
1107         unsigned char *buff;
1108         struct proto *prot;
1109         struct device *dev = NULL;
1110 
1111         sk->inuse=1;
1112         prot = sk->prot;
1113         while(len > 0) 
1114         {
1115                 if (sk->err) 
1116                 {                       /* Stop on an error */
1117                         release_sock(sk);
1118                         if (copied) 
1119                                 return(copied);
1120                         tmp = -sk->err;
1121                         sk->err = 0;
1122                         return(tmp);
1123                 }
1124 
1125         /*
1126          *      First thing we do is make sure that we are established. 
1127          */
1128         
1129                 if (sk->shutdown & SEND_SHUTDOWN) 
1130                 {
1131                         release_sock(sk);
1132                         sk->err = EPIPE;
1133                         if (copied) 
1134                                 return(copied);
1135                         sk->err = 0;
1136                         return(-EPIPE);
1137                 }
1138 
1139 
1140         /* 
1141          *      Wait for a connection to finish.
1142          */
1143         
1144                 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
1145                 {
1146                         if (sk->err) 
1147                         {
1148                                 release_sock(sk);
1149                                 if (copied) 
1150                                         return(copied);
1151                                 tmp = -sk->err;
1152                                 sk->err = 0;
1153                                 return(tmp);
1154                         }
1155 
1156                         if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) 
1157                         {
1158                                 release_sock(sk);
1159                                 if (copied) 
1160                                         return(copied);
1161 
1162                                 if (sk->err) 
1163                                 {
1164                                         tmp = -sk->err;
1165                                         sk->err = 0;
1166                                         return(tmp);
1167                                 }
1168 
1169                                 if (sk->keepopen) 
1170                                 {
1171                                         send_sig(SIGPIPE, current, 0);
1172                                 }
1173                                 return(-EPIPE);
1174                         }
1175 
1176                         if (nonblock || copied) 
1177                         {
1178                                 release_sock(sk);
1179                                 if (copied) 
1180                                         return(copied);
1181                                 return(-EAGAIN);
1182                         }
1183 
1184                         release_sock(sk);
1185                         cli();
1186                 
1187                         if (sk->state != TCP_ESTABLISHED &&
1188                                 sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
1189                         {
1190                                 interruptible_sleep_on(sk->sleep);
1191                                 if (current->signal & ~current->blocked) 
1192                                 {
1193                                         sti();
1194                                         if (copied) 
1195                                                 return(copied);
1196                                         return(-ERESTARTSYS);
1197                                 }
1198                         }
1199                         sk->inuse = 1;
1200                         sti();
1201                 }
1202 
1203         /*
1204          * The following code can result in copy <= 0 if sk->mss is ever
1205          * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
1206          * sk->mtu is constant once SYN processing is finished.  I.e. we
1207          * had better not get here until we've seen his SYN and at least one
1208          * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
1209          * But ESTABLISHED should guarantee that.  sk->max_window is by definition
1210          * non-decreasing.  Note that any ioctl to set user_mss must be done
1211          * before the exchange of SYN's.  If the initial ack from the other
1212          * end has a window of 0, max_window and thus mss will both be 0.
1213          */
1214 
1215         /* 
1216          *      Now we need to check if we have a half built packet. 
1217          */
1218 
1219                 if ((skb = tcp_dequeue_partial(sk)) != NULL) 
1220                 {
1221                         int hdrlen;
1222 
1223                          /* IP header + TCP header */
1224                         hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
1225                                  + sizeof(struct tcphdr);
1226         
1227                         /* Add more stuff to the end of skb->len */
1228                         if (!(flags & MSG_OOB)) 
1229                         {
1230                                 copy = min(sk->mss - (skb->len - hdrlen), len);
1231                                 /* FIXME: this is really a bug. */
1232                                 if (copy <= 0) 
1233                                 {
1234                                         printk("TCP: **bug**: \"copy\" <= 0!!\n");
1235                                         copy = 0;
1236                                 }
1237           
1238                                 memcpy_fromfs(skb->data + skb->len, from, copy);
1239                                 skb->len += copy;
1240                                 from += copy;
1241                                 copied += copy;
1242                                 len -= copy;
1243                                 sk->write_seq += copy;
1244                         }
1245                         if ((skb->len - hdrlen) >= sk->mss ||
1246                                 (flags & MSG_OOB) || !sk->packets_out)
1247                                 tcp_send_skb(sk, skb);
1248                         else
1249                                 tcp_enqueue_partial(skb, sk);
1250                         continue;
1251                 }
1252 
1253         /*
1254          * We also need to worry about the window.
1255          * If window < 1/2 the maximum window we've seen from this
1256          *   host, don't use it.  This is sender side
1257          *   silly window prevention, as specified in RFC1122.
1258          *   (Note that this is different than earlier versions of
1259          *   SWS prevention, e.g. RFC813.).  What we actually do is 
1260         *   use the whole MSS.  Since this results in the right
1261          *   edge of the packet being outside the window, it will
1262          *   be queued for later rather than sent.
1263          */
1264 
1265                 copy = sk->window_seq - sk->write_seq;
1266                 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1267                         copy = sk->mss;
1268                 if (copy > len)
1269                         copy = len;
1270 
1271         /*
1272          *      We should really check the window here also. 
1273          */
1274          
1275                 send_tmp = NULL;
1276                 if (copy < sk->mss && !(flags & MSG_OOB)) 
1277                 {
1278                         /*
1279                          *      We will release the socket in case we sleep here. 
1280                          */
1281                         release_sock(sk);
1282                         /*
1283                          *      NB: following must be mtu, because mss can be increased.
1284                          *      mss is always <= mtu 
1285                          */
1286                         skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
1287                         sk->inuse = 1;
1288                         send_tmp = skb;
1289                 } 
1290                 else 
1291                 {
1292                         /*
1293                          *      We will release the socket in case we sleep here. 
1294                          */
1295                         release_sock(sk);
1296                         skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
1297                         sk->inuse = 1;
1298                 }
1299 
1300                 /*
1301                  *      If we didn't get any memory, we need to sleep. 
1302                  */
1303 
1304                 if (skb == NULL) 
1305                 {
1306                         if (nonblock) 
1307                         {
1308                                 release_sock(sk);
1309                                 if (copied) 
1310                                         return(copied);
1311                                 return(-EAGAIN);
1312                         }
1313 
1314                         /*
1315                          *      FIXME: here is another race condition. 
1316                          */
1317 
1318                         tmp = sk->wmem_alloc;
1319                         release_sock(sk);
1320                         cli();
1321                         /*
1322                          *      Again we will try to avoid it. 
1323                          */
1324                         if (tmp <= sk->wmem_alloc &&
1325                                   (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1326                                 && sk->err == 0) 
1327                         {
1328                                 interruptible_sleep_on(sk->sleep);
1329                                 if (current->signal & ~current->blocked) 
1330                                 {
1331                                         sti();
1332                                         if (copied) 
1333                                                 return(copied);
1334                                         return(-ERESTARTSYS);
1335                                 }
1336                         }
1337                         sk->inuse = 1;
1338                         sti();
1339                         continue;
1340                 }
1341 
1342                 skb->len = 0;
1343                 skb->sk = sk;
1344                 skb->free = 0;
1345                 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1346         
1347                 buff = skb->data;
1348         
1349                 /*
1350                  * FIXME: we need to optimize this.
1351                  * Perhaps some hints here would be good.
1352                  */
1353                 
1354                 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1355                                  IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
1356                 if (tmp < 0 ) 
1357                 {
1358                         prot->wfree(sk, skb->mem_addr, skb->mem_len);
1359                         release_sock(sk);
1360                         if (copied) 
1361                                 return(copied);
1362                         return(tmp);
1363                 }
1364                 skb->len += tmp;
1365                 skb->dev = dev;
1366                 buff += tmp;
1367                 skb->h.th =(struct tcphdr *) buff;
1368                 tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
1369                 if (tmp < 0) 
1370                 {
1371                         prot->wfree(sk, skb->mem_addr, skb->mem_len);
1372                         release_sock(sk);
1373                         if (copied) 
1374                                 return(copied);
1375                         return(tmp);
1376                 }
1377 
1378                 if (flags & MSG_OOB) 
1379                 {
1380                         ((struct tcphdr *)buff)->urg = 1;
1381                         ((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
1382                 }
1383                 skb->len += tmp;
1384                 memcpy_fromfs(buff+tmp, from, copy);
1385 
1386                 from += copy;
1387                 copied += copy;
1388                 len -= copy;
1389                 skb->len += copy;
1390                 skb->free = 0;
1391                 sk->write_seq += copy;
1392         
1393                 if (send_tmp != NULL && sk->packets_out) 
1394                 {
1395                         tcp_enqueue_partial(send_tmp, sk);
1396                         continue;
1397                 }
1398                 tcp_send_skb(sk, skb);
1399         }
1400         sk->err = 0;
1401 
1402 /*
1403  *      Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
1404  *      interactive fast network servers. It's meant to be on and
1405  *      it really improves the throughput, though not the echo time,
1406  *      on my slow SLIP link - Alan
1407  */
1408 
1409 /*
1410  *      Avoid possible race on send_tmp - c/o Johannes Stille 
1411  */
1412  
1413         if(sk->partial && ((!sk->packets_out) 
1414      /* If we are not Nagling we can send in the before() case too. */
1415               || (sk->nonagle && before(sk->write_seq , sk->window_seq))
1416         ))
1417                 tcp_send_partial(sk);
1418 
1419         release_sock(sk);
1420         return(copied);
1421 }
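
/*
 *      Editor's note: the block below is an illustrative, self-contained
 *      sketch added for exposition; it is not part of the original file
 *      and is never compiled.  It restates two decisions made inside
 *      tcp_write() above: the RFC 1122 sender-side silly window avoidance
 *      that clamps the amount copied into a segment, and the Nagle test
 *      that decides whether a partial segment may be sent now.  The helper
 *      names (tcp_sws_copy, nagle_may_send) are hypothetical.
 */
#if 0   /* illustrative sketch only */
static int tcp_sws_copy(unsigned long window_seq, unsigned long write_seq,
                        unsigned long max_window, int mss, int len)
{
        /*
         * Usable window is window_seq - write_seq.  If it is non-positive,
         * smaller than half the largest window the peer ever offered, or
         * larger than one MSS, just use a whole MSS; the right edge may
         * then fall outside the window, so the segment gets queued rather
         * than sent.
         */
        int copy = (int)(window_seq - write_seq);

        if (copy <= 0 || copy < (int)(max_window >> 1) || copy > mss)
                copy = mss;
        if (copy > len)
                copy = len;
        return copy;
}

static int nagle_may_send(int packets_out, int nonagle,
                          unsigned long write_seq, unsigned long window_seq)
{
        /*
         * A partial segment may go out only when nothing is in flight,
         * or when Nagle is disabled (TCP_NODELAY) and the data still
         * fits inside the offered window.  before() is this file's
         * sequence-number comparison.
         */
        return !packets_out ||
               (nonagle && before(write_seq, window_seq));
}
#endif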
1422 
1423 
1424 static int tcp_sendto(struct sock *sk, unsigned char *from,
1425            int len, int nonblock, unsigned flags,
1426            struct sockaddr_in *addr, int addr_len)
1427 {
1428         if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1429                 return -EINVAL;
1430         if (sk->state == TCP_CLOSE)
1431                 return -ENOTCONN;
1432         if (addr_len < sizeof(*addr))
1433                 return -EINVAL;
1434         if (addr->sin_family && addr->sin_family != AF_INET) 
1435                 return -EINVAL;
1436         if (addr->sin_port != sk->dummy_th.dest) 
1437                 return -EISCONN;
1438         if (addr->sin_addr.s_addr != sk->daddr) 
1439                 return -EISCONN;
1440         return tcp_write(sk, from, len, nonblock, flags);
1441 }
1442 
1443 
1444 static void tcp_read_wakeup(struct sock *sk)
1445 {
1446         int tmp;
1447         struct device *dev = NULL;
1448         struct tcphdr *t1;
1449         struct sk_buff *buff;
1450 
1451         if (!sk->ack_backlog) 
1452                 return;
1453 
1454         /*
1455          * FIXME: we need to put code here to prevent this routine from
1456          * being called.  Being called once in a while is ok, so only check
1457          * if this is the second time in a row.
1458          */
1459 
1460         /*
1461          * We need to grab some memory, and put together an ack,
1462          * and then put it into the queue to be sent.
1463          */
1464 
1465         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
1466         if (buff == NULL) 
1467         {
1468                 /* Try again real soon. */
1469                 reset_timer(sk, TIME_WRITE, 10);
1470                 return;
1471         }
1472 
1473         buff->len = sizeof(struct tcphdr);
1474         buff->sk = sk;
1475         buff->localroute = sk->localroute;
1476         
1477         /*
1478          *      Put in the IP header and routing stuff. 
1479          */
1480 
1481         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
1482                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1483         if (tmp < 0) 
1484         {
1485                 buff->free=1;
1486                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
1487                 return;
1488         }
1489 
1490         buff->len += tmp;
1491         t1 =(struct tcphdr *)(buff->data +tmp);
1492 
1493         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
1494         t1->seq = htonl(sk->sent_seq);
1495         t1->ack = 1;
1496         t1->res1 = 0;
1497         t1->res2 = 0;
1498         t1->rst = 0;
1499         t1->urg = 0;
1500         t1->syn = 0;
1501         t1->psh = 0;
1502         sk->ack_backlog = 0;
1503         sk->bytes_rcv = 0;
1504         sk->window = tcp_select_window(sk);
1505         t1->window = ntohs(sk->window);
1506         t1->ack_seq = ntohl(sk->acked_seq);
1507         t1->doff = sizeof(*t1)/4;
1508         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
1509         sk->prot->queue_xmit(sk, dev, buff, 1);
1510         tcp_statistics.TcpOutSegs++;
1511 }
1512 
1513 
1514 /*
1515  *      FIXME:
1516  *      This routine frees used buffers.
1517  *      It should consider sending an ACK to let the
1518  *      other end know we now have a bigger window.
1519  */
1520 
1521 static void cleanup_rbuf(struct sock *sk)
1522 {
1523         unsigned long flags;
1524         unsigned long left;
1525         struct sk_buff *skb;
1526         unsigned long rspace;
1527 
1528         if(sk->debug)
1529                 printk("cleaning rbuf for sk=%p\n", sk);
1530   
1531         save_flags(flags);
1532         cli();
1533   
1534         left = sk->prot->rspace(sk);
1535  
1536         /*
1537          * We have to loop through all the buffer headers,
1538          * and try to free up all the space we can.
1539          */
1540 
1541         while((skb=skb_peek(&sk->receive_queue)) != NULL) 
1542         {
1543                 if (!skb->used) 
1544                         break;
1545                 skb_unlink(skb);
1546                 skb->sk = sk;
1547                 kfree_skb(skb, FREE_READ);
1548         }
1549 
1550         restore_flags(flags);
1551 
1552         /*
1553          * FIXME:
1554          * At this point we should send an ack if the difference
1555          * in the window, and the amount of space is bigger than
1556          * TCP_WINDOW_DIFF.
1557          */
1558 
1559         if(sk->debug)
1560                 printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
1561                                             left);
1562         if ((rspace=sk->prot->rspace(sk)) != left) 
1563         {
1564                 /*
1565                  * This area has caused the most trouble.  The current strategy
1566                  * is to simply do nothing if the other end has room to send at
1567                  * least 3 full packets, because the ack from those will auto-
1568                  * matically update the window.  If the other end doesn't think
1569                  * we have much space left, but we have room for at least 1 more
1570                  * complete packet than it thinks we do, we will send an ack
1571                  * immediately.  Otherwise we will wait up to .5 seconds in case
1572                  * the user reads some more.
1573                  */
1574                 sk->ack_backlog++;
1575         /*
1576          * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
1577          * if the other end is offering a window smaller than the agreed on MSS
1578          * (called sk->mtu here).  In theory there's no connection between send
1579          * and receive, and so no reason to think that they're going to send
1580          * small packets.  For the moment I'm using the hack of reducing the mss
1581          * only on the send side, so I'm putting mtu here.
1582          */
1583 
1584                 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
1585                 {
1586                         /* Send an ack right now. */
1587                         tcp_read_wakeup(sk);
1588                 } 
1589                 else 
1590                 {
1591                         /* Force it to send an ack soon. */
1592                         int was_active = del_timer(&sk->timer);
1593                         if (!was_active || TCP_ACK_TIME < sk->timer.expires) 
1594                         {
1595                                 reset_timer(sk, TIME_WRITE, TCP_ACK_TIME);
1596                         } 
1597                         else
1598                                 add_timer(&sk->timer);
1599                 }
1600         }
1601 } 
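
/*
 *      Editor's note: an illustrative sketch (not part of the original
 *      file, never compiled) of the window-update decision at the end of
 *      cleanup_rbuf() above.  The helper name is hypothetical.
 */
#if 0   /* illustrative sketch only */
static int window_update_ack_now(unsigned long rspace,
                                 unsigned long advertised_window,
                                 unsigned long bytes_rcv,
                                 unsigned long mtu)
{
        /*
         * If freeing read buffers opened up at least one full packet more
         * receive space than the peer currently believes it has, an ACK
         * carrying the larger window is worth sending right away;
         * otherwise it can wait up to TCP_ACK_TIME in case the reader
         * frees more.
         */
        return rspace > (advertised_window - bytes_rcv + mtu);
}
#endif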
1602 
1603 
1604 /*
1605  *      Handle reading urgent data. 
1606  */
1607  
1608 static int tcp_read_urg(struct sock * sk, int nonblock,
1609              unsigned char *to, int len, unsigned flags)
1610 {
1611         if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
1612                 return -EINVAL;
1613         if (sk->err) 
1614         {
1615                 int tmp = -sk->err;
1616                 sk->err = 0;
1617                 return tmp;
1618         }
1619 
1620         if (sk->state == TCP_CLOSE || sk->done) 
1621         {
1622                 if (!sk->done) {
1623                         sk->done = 1;
1624                         return 0;
1625                 }
1626                 return -ENOTCONN;
1627         }
1628 
1629         if (sk->shutdown & RCV_SHUTDOWN) 
1630         {
1631                 sk->done = 1;
1632                 return 0;
1633         }
1634         sk->inuse = 1;
1635         if (sk->urg_data & URG_VALID) 
1636         {
1637                 char c = sk->urg_data;
1638                 if (!(flags & MSG_PEEK))
1639                         sk->urg_data = URG_READ;
1640                 put_fs_byte(c, to);
1641                 release_sock(sk);
1642                 return 1;
1643         }
1644         release_sock(sk);
1645         
1646         /*
1647          * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1648          * the available implementations agree in this case:
1649          * this call should never block, independent of the
1650          * blocking state of the socket.
1651          * Mike <pall@rz.uni-karlsruhe.de>
1652          */
1653         return -EAGAIN;
1654 }
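
/*
 *      Editor's note: an illustrative user-level sketch (not part of the
 *      original file, never compiled) of the behaviour implemented by
 *      tcp_read_urg() above: with SO_OOBINLINE off, at most one byte of
 *      urgent data is readable via MSG_OOB, and the call never blocks --
 *      it fails with EAGAIN when no urgent byte is pending, even on a
 *      blocking socket.  The helper name is hypothetical.
 */
#if 0   /* illustrative sketch only */
#include <errno.h>
#include <sys/socket.h>

static int read_oob_byte(int fd, unsigned char *byte)
{
        int n = recv(fd, byte, 1, MSG_OOB);

        if (n == 1)
                return 1;               /* got the urgent byte        */
        if (n < 0 && errno == EAGAIN)
                return 0;               /* no urgent data pending     */
        return -1;                      /* error or connection closed */
}
#endif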
1655 
1656 
1657 /*
1658  *      This routine copies from a sock struct into the user buffer. 
1659  */
1660  
1661 static int tcp_read(struct sock *sk, unsigned char *to,
1662         int len, int nonblock, unsigned flags)
1663 {
1664         struct wait_queue wait = { current, NULL };
1665         int copied = 0;
1666         unsigned long peek_seq;
1667         unsigned long *seq;
1668         unsigned long used;
1669 
1670         /* This error should be checked. */
1671         if (sk->state == TCP_LISTEN)
1672                 return -ENOTCONN;
1673 
1674         /* Urgent data needs to be handled specially. */
1675         if (flags & MSG_OOB)
1676                 return tcp_read_urg(sk, nonblock, to, len, flags);
1677 
1678         peek_seq = sk->copied_seq;
1679         seq = &sk->copied_seq;
1680         if (flags & MSG_PEEK)
1681                 seq = &peek_seq;
1682 
1683         add_wait_queue(sk->sleep, &wait);
1684         sk->inuse = 1;
1685         while (len > 0) 
1686         {
1687                 struct sk_buff * skb;
1688                 unsigned long offset;
1689         
1690                 /*
1691                  * are we at urgent data? Stop if we have read anything.
1692                  */
1693                 if (copied && sk->urg_data && sk->urg_seq == *seq)
1694                         break;
1695 
1696                 current->state = TASK_INTERRUPTIBLE;
1697 
1698                 skb = skb_peek(&sk->receive_queue);
1699                 do 
1700                 {
1701                         if (!skb)
1702                                 break;
1703                         if (before(*seq, skb->h.th->seq))
1704                                 break;
1705                         offset = *seq - skb->h.th->seq;
1706                         if (skb->h.th->syn)
1707                                 offset--;
1708                         if (offset < skb->len)
1709                                 goto found_ok_skb;
1710                         if (skb->h.th->fin)
1711                                 goto found_fin_ok;
1712                         if (!(flags & MSG_PEEK))
1713                                 skb->used = 1;
1714                         skb = skb->next;
1715                 }
1716                 while (skb != (struct sk_buff *)&sk->receive_queue);
1717 
1718                 if (copied)
1719                         break;
1720 
1721                 if (sk->err) 
1722                 {
1723                         copied = -sk->err;
1724                         sk->err = 0;
1725                         break;
1726                 }
1727 
1728                 if (sk->state == TCP_CLOSE) 
1729                 {
1730                         if (!sk->done) 
1731                         {
1732                                 sk->done = 1;
1733                                 break;
1734                         }
1735                         copied = -ENOTCONN;
1736                         break;
1737                 }
1738 
1739                 if (sk->shutdown & RCV_SHUTDOWN) 
1740                 {
1741                         sk->done = 1;
1742                         break;
1743                 }
1744                         
1745                 if (nonblock) 
1746                 {
1747                         copied = -EAGAIN;
1748                         break;
1749                 }
1750 
1751                 cleanup_rbuf(sk);
1752                 release_sock(sk);
1753                 schedule();
1754                 sk->inuse = 1;
1755 
1756                 if (current->signal & ~current->blocked) 
1757                 {
1758                         copied = -ERESTARTSYS;
1759                         break;
1760                 }
1761                 continue;
1762 
1763         found_ok_skb:
1764                 /* OK, so how much can we use? */
1765                 used = skb->len - offset;
1766                 if (len < used)
1767                         used = len;
1768                 /* do we have urgent data here? */
1769                 if (sk->urg_data) 
1770                 {
1771                         unsigned long urg_offset = sk->urg_seq - *seq;
1772                         if (urg_offset < used) 
1773                         {
1774                                 if (!urg_offset) 
1775                                 {
1776                                         if (!sk->urginline) 
1777                                         {
1778                                                 ++*seq;
1779                                                 offset++;
1780                                                 used--;
1781                                         }
1782                                 }
1783                                 else
1784                                         used = urg_offset;
1785                         }
1786                 }
1787                 /* Copy it */
1788                 memcpy_tofs(to,((unsigned char *)skb->h.th) +
1789                         skb->h.th->doff*4 + offset, used);
1790                 copied += used;
1791                 len -= used;
1792                 to += used;
1793                 *seq += used;
1794                 if (after(sk->copied_seq,sk->urg_seq))
1795                         sk->urg_data = 0;
1796                 if (used + offset < skb->len)
1797                         continue;
1798                 if (skb->h.th->fin)
1799                         goto found_fin_ok;
1800                 if (flags & MSG_PEEK)
1801                         continue;
1802                 skb->used = 1;
1803                 continue;
1804 
1805         found_fin_ok:
1806                 ++*seq;
1807                 if (flags & MSG_PEEK)
1808                         break;
1809                 skb->used = 1;
1810                 sk->shutdown |= RCV_SHUTDOWN;
1811                 break;
1812 
1813         }
1814         remove_wait_queue(sk->sleep, &wait);
1815         current->state = TASK_RUNNING;
1816 
1817         /* Clean up data we have read: This will do ACK frames */
1818         cleanup_rbuf(sk);
1819         release_sock(sk);
1820         return copied;
1821 }
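
/*
 *      Editor's note: an illustrative sketch (not part of the original
 *      file, never compiled) of the sequence bookkeeping used by
 *      tcp_read() above.  The helper name is hypothetical.
 */
#if 0   /* illustrative sketch only */
static unsigned long *reader_seq(struct sock *sk, unsigned flags,
                                 unsigned long *peek_seq)
{
        /*
         * Consumption is tracked by a sequence number rather than by
         * unlinking skbs: MSG_PEEK advances a local copy of copied_seq,
         * so the same data is returned again on the next read, while a
         * normal read advances sk->copied_seq itself and cleanup_rbuf()
         * later frees the fully used buffers.
         */
        *peek_seq = sk->copied_seq;
        return (flags & MSG_PEEK) ? peek_seq : &sk->copied_seq;
}
#endif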
1822 
1823  
1824 /*
1825  *      Shutdown the sending side of a connection.
1826  */
1827 
1828 void tcp_shutdown(struct sock *sk, int how)
1829 {
1830         struct sk_buff *buff;
1831         struct tcphdr *t1, *th;
1832         struct proto *prot;
1833         int tmp;
1834         struct device *dev = NULL;
1835 
1836         /*
1837          * We need to grab some memory, and put together a FIN,
1838          * and then put it into the queue to be sent.
1839          * FIXME:
1840          *
1841          *      Tim MacKenzie (tym@dibbler.cs.monash.edu.au), 4 Dec '92.
1842          *      Most of this is guesswork, so maybe it will work...
1843          */
1844 
1845         if (!(how & SEND_SHUTDOWN)) 
1846                 return;
1847          
1848         /*
1849          *      If we've already sent a FIN, return. 
1850          */
1851          
1852         if (sk->state == TCP_FIN_WAIT1 ||
1853             sk->state == TCP_FIN_WAIT2 ||
1854             sk->state == TCP_CLOSING ||
1855             sk->state == TCP_LAST_ACK ||
1856             sk->state == TCP_TIME_WAIT
1857         ) 
1858         {
1859                 return;
1860         }
1861         sk->inuse = 1;
1862 
1863         /*
1864          * Flag that the sender has shut down.
1865          */
1866 
1867         sk->shutdown |= SEND_SHUTDOWN;
1868 
1869         /*
1870          *  Clear out any half-completed packets. 
1871          */
1872 
1873         if (sk->partial)
1874                 tcp_send_partial(sk);
1875 
1876         prot =(struct proto *)sk->prot;
1877         th =(struct tcphdr *)&sk->dummy_th;
1878         release_sock(sk); /* in case the malloc sleeps. */
1879         buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
1880         if (buff == NULL)
1881                 return;
1882         sk->inuse = 1;
1883 
1884         buff->sk = sk;
1885         buff->len = sizeof(*t1);
1886         buff->localroute = sk->localroute;
1887         t1 =(struct tcphdr *) buff->data;
1888 
1889         /*
1890          *      Put in the IP header and routing stuff. 
1891          */
1892 
1893         tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
1894                            IPPROTO_TCP, sk->opt,
1895                            sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
1896         if (tmp < 0) 
1897         {
1898                 /*
1899                  *      Finish anyway, treat this as a send that got lost. 
1900                  *
1901                  *      Enter FIN_WAIT1 on normal shutdown, which waits for
1902                  *      written data to be completely acknowledged along
1903                  *      with an acknowledge to our FIN.
1904                  *
1905                  *      Enter FIN_WAIT2 on abnormal shutdown -- close before
1906                  *      connection established.
1907                  */
1908                 buff->free=1;
1909                 prot->wfree(sk,buff->mem_addr, buff->mem_len);
1910 
1911                 if (sk->state == TCP_ESTABLISHED)
1912                         tcp_set_state(sk,TCP_FIN_WAIT1);
1913                 else if(sk->state == TCP_CLOSE_WAIT)
1914                         tcp_set_state(sk,TCP_LAST_ACK);
1915                 else
1916                         tcp_set_state(sk,TCP_FIN_WAIT2);
1917 
1918                 release_sock(sk);
1919                 return;
1920         }
1921 
1922         t1 =(struct tcphdr *)((char *)t1 +tmp);
1923         buff->len += tmp;
1924         buff->dev = dev;
1925         memcpy(t1, th, sizeof(*t1));
1926         t1->seq = ntohl(sk->write_seq);
1927         sk->write_seq++;
1928         buff->h.seq = sk->write_seq;
1929         t1->ack = 1;
1930         t1->ack_seq = ntohl(sk->acked_seq);
1931         t1->window = ntohs(sk->window=tcp_select_window(sk));
1932         t1->fin = 1;
1933         t1->rst = 0;
1934         t1->doff = sizeof(*t1)/4;
1935         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
1936 
1937         /*
1938          * If there is data in the write queue, the fin must be appended to
1939          * the write queue.
1940          */
1941         
1942         if (skb_peek(&sk->write_queue) != NULL) 
1943         {
1944                 buff->free=0;
1945                 if (buff->next != NULL) 
1946                 {
1947                         printk("tcp_shutdown: next != NULL\n");
1948                         skb_unlink(buff);
1949                 }
1950                 skb_queue_tail(&sk->write_queue, buff);
1951         } 
1952         else 
1953         {
1954                 sk->sent_seq = sk->write_seq;
1955                 sk->prot->queue_xmit(sk, dev, buff, 0);
1956         }
1957 
1958         if (sk->state == TCP_ESTABLISHED) 
1959                 tcp_set_state(sk,TCP_FIN_WAIT1);
1960         else if (sk->state == TCP_CLOSE_WAIT)
1961                 tcp_set_state(sk,TCP_LAST_ACK);
1962         else
1963                 tcp_set_state(sk,TCP_FIN_WAIT2);
1964 
1965         release_sock(sk);
1966 }
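
/*
 *      Editor's note: an illustrative sketch (not part of the original
 *      file, never compiled) of the state transition applied once our FIN
 *      has been queued or sent, as done at the end of tcp_shutdown()
 *      above.  (tcp_close() below does the same, except that a socket in
 *      CLOSING is left where it is.)  The helper name is hypothetical.
 */
#if 0   /* illustrative sketch only */
static int state_after_sending_fin(int state)
{
        if (state == TCP_ESTABLISHED)
                return TCP_FIN_WAIT1;   /* wait for data + FIN to be acked */
        if (state == TCP_CLOSE_WAIT)
                return TCP_LAST_ACK;    /* peer's FIN already seen */
        return TCP_FIN_WAIT2;           /* e.g. closed before fully set up */
}
#endif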
1967 
1968 
1969 static int
1970 tcp_recvfrom(struct sock *sk, unsigned char *to,
1971              int to_len, int nonblock, unsigned flags,
1972              struct sockaddr_in *addr, int *addr_len)
1973 {
1974         int result;
1975   
1976         /* 
1977          *      We have to check these first, unlike the old code.
1978          *      If we checked them after the read we would lose data
1979          *      on an error, which is wrong.
1980          */
1981 
1982         if(addr_len)
1983                 *addr_len = sizeof(*addr);
1984         result=tcp_read(sk, to, to_len, nonblock, flags);
1985 
1986         if (result < 0) 
1987                 return(result);
1988   
1989         if(addr)
1990         {
1991                 addr->sin_family = AF_INET;
1992                 addr->sin_port = sk->dummy_th.dest;
1993                 addr->sin_addr.s_addr = sk->daddr;
1994         }
1995         return(result);
1996 }
1997 
1998 
1999 /*
2000  *      This routine will send an RST to the other tcp. 
2001  */
2002  
2003 static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
2004           struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
2005 {
2006         struct sk_buff *buff;
2007         struct tcphdr *t1;
2008         int tmp;
2009         struct device *ndev=NULL;
2010 
2011         /*
2012          *      Cannot reset a reset (Think about it).
2013          */
2014          
2015         if(th->rst)
2016                 return;
2017   
2018         /*
2019          * We need to grab some memory, and put together an RST,
2020          * and then put it into the queue to be sent.
2021          */
2022 
2023         buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2024         if (buff == NULL) 
2025                 return;
2026 
2027         buff->len = sizeof(*t1);
2028         buff->sk = NULL;
2029         buff->dev = dev;
2030         buff->localroute = 0;
2031 
2032         t1 =(struct tcphdr *) buff->data;
2033 
2034         /*
2035          *      Put in the IP header and routing stuff. 
2036          */
2037 
2038         tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2039                            sizeof(struct tcphdr),tos,ttl);
2040         if (tmp < 0) 
2041         {
2042                 buff->free = 1;
2043                 prot->wfree(NULL, buff->mem_addr, buff->mem_len);
2044                 return;
2045         }
2046 
2047         t1 =(struct tcphdr *)((char *)t1 +tmp);
2048         buff->len += tmp;
2049         memcpy(t1, th, sizeof(*t1));
2050 
2051         /*
2052          *      Swap the send and the receive. 
2053          */
2054 
2055         t1->dest = th->source;
2056         t1->source = th->dest;
2057         t1->rst = 1;  
2058         t1->window = 0;
2059   
2060         if(th->ack)
2061         {
2062                 t1->ack = 0;
2063                 t1->seq = th->ack_seq;
2064                 t1->ack_seq = 0;
2065         }
2066         else
2067         {
2068                 t1->ack = 1;
2069                 if(!th->syn)
2070                         t1->ack_seq=htonl(th->seq);
2071                 else
2072                         t1->ack_seq=htonl(th->seq+1);
2073                 t1->seq=0;
2074         }
2075 
2076         t1->syn = 0;
2077         t1->urg = 0;
2078         t1->fin = 0;
2079         t1->psh = 0;
2080         t1->doff = sizeof(*t1)/4;
2081         tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2082         prot->queue_xmit(NULL, ndev, buff, 1);
2083         tcp_statistics.TcpOutSegs++;
2084 }
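
/*
 *      Editor's note: an illustrative sketch (not part of the original
 *      file, never compiled) of the RFC 793 sequence-number rules used
 *      when building the RST above.  Byte-order conversions are omitted
 *      and the struct and helper names are hypothetical.
 */
#if 0   /* illustrative sketch only */
struct rst_fields {
        unsigned long seq;
        unsigned long ack_seq;
        int ack;
};

static struct rst_fields rst_for(int in_ack, unsigned long in_ack_seq,
                                 int in_syn, unsigned long in_seq)
{
        struct rst_fields r;

        if (in_ack) {
                /* Offender carried an ACK: reuse it as our sequence. */
                r.ack = 0;
                r.seq = in_ack_seq;
                r.ack_seq = 0;
        } else {
                /* No ACK: use sequence 0 and acknowledge what arrived;
                   a SYN occupies one sequence number of its own. */
                r.ack = 1;
                r.seq = 0;
                r.ack_seq = in_seq + (in_syn ? 1 : 0);
        }
        return r;
}
#endif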
2085 
2086 
2087 /*
2088  *      Look for tcp options. Parses everything but only knows about MSS.
2089  *      This routine is always called with the packet containing the SYN.
2090  *      However it may also be called with the ack to the SYN.  So you
2091  *      can't assume this is always the SYN.  It's always called after
2092  *      we have set up sk->mtu to our own MTU.
2093  *
2094  *      We need, at a minimum, to add PAWS support here; possibly also large windows
2095  *      as Linux gets deployed on 100Mb/sec networks.
2096  */
2097  
2098 static void tcp_options(struct sock *sk, struct tcphdr *th)
2099 {
2100         unsigned char *ptr;
2101         int length=(th->doff*4)-sizeof(struct tcphdr);
2102         int mss_seen = 0;
2103     
2104         ptr = (unsigned char *)(th + 1);
2105   
2106         while(length>0)
2107         {
2108                 int opcode=*ptr++;
2109                 int opsize=*ptr++;
2110                 switch(opcode)
2111                 {
2112                         case TCPOPT_EOL:
2113                                 return;
2114                         case TCPOPT_NOP:
2115                                 length-=2;
2116                                 continue;
2117                         
2118                         default:
2119                                 if(opsize<=2)   /* Avoid silly options looping forever */
2120                                         return;
2121                                 switch(opcode)
2122                                 {
2123                                         case TCPOPT_MSS:
2124                                                 if(opsize==4 && th->syn)
2125                                                 {
2126                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2127                                                         mss_seen = 1;
2128                                                 }
2129                                                 break;
2130                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
2131                                 }
2132                                 ptr+=opsize-2;
2133                                 length-=opsize;
2134                 }
2135         }
2136         if (th->syn) 
2137         {
2138                 if (! mss_seen)
2139                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
2140         }
2141 #ifdef CONFIG_INET_PCTCP
2142         sk->mss = min(sk->max_window >> 1, sk->mtu);
2143 #else    
2144         sk->mss = min(sk->max_window, sk->mtu);
2145 #endif  
2146 }
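
/*
 *      Editor's note: an illustrative sketch (not part of the original
 *      file, never compiled) of the option walk performed by tcp_options()
 *      above, reduced to the one option it understands.  TCP options are
 *      (kind, length, data) triples, except that EOL (0) ends the list and
 *      NOP (1) is a single padding byte; MSS is kind 2, length 4, with a
 *      16-bit value in network byte order.  The helper name is
 *      hypothetical and the result is returned in host order.
 */
#if 0   /* illustrative sketch only */
static int parse_mss_option(unsigned char *ptr, int length)
{
        while (length > 0) {
                int opcode = *ptr++;
                int opsize;

                if (opcode == 0)                /* TCPOPT_EOL: end of list */
                        return 0;
                if (opcode == 1) {              /* TCPOPT_NOP: one pad byte */
                        length--;
                        continue;
                }
                if (length < 2)
                        return 0;               /* truncated option */
                opsize = *ptr++;
                if (opsize <= 2 || opsize > length)
                        return 0;               /* malformed: avoid looping */
                if (opcode == 2 && opsize == 4) /* TCPOPT_MSS */
                        return (ptr[0] << 8) | ptr[1];
                ptr += opsize - 2;
                length -= opsize;
        }
        return 0;                               /* no MSS option present */
}
#endif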
2147 
2148 static inline unsigned long default_mask(unsigned long dst)
2149 {
2150         dst = ntohl(dst);
2151         if (IN_CLASSA(dst))
2152                 return htonl(IN_CLASSA_NET);
2153         if (IN_CLASSB(dst))
2154                 return htonl(IN_CLASSB_NET);
2155         return htonl(IN_CLASSC_NET);
2156 }
2157 
2158 /*
2159  *      Default sequence number picking algorithm.
2160  */
2161 
2162 extern inline long tcp_init_seq(void)
2163 {
2164         return jiffies * SEQ_TICK - seq_offset; 
2165 }
2166 
2167 /*
2168  *      This routine handles a connection request.
2169  *      It should make sure we haven't already responded.
2170  *      Because of the way BSD works, we have to send a syn/ack now.
2171  *      This also means it will be harder to close a socket which is
2172  *      listening.
2173  */
2174  
2175 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
2176                  unsigned long daddr, unsigned long saddr,
2177                  struct options *opt, struct device *dev, unsigned long seq)
2178 {
2179         struct sk_buff *buff;
2180         struct tcphdr *t1;
2181         unsigned char *ptr;
2182         struct sock *newsk;
2183         struct tcphdr *th;
2184         struct device *ndev=NULL;
2185         int tmp;
2186         struct rtable *rt;
2187   
2188         th = skb->h.th;
2189 
2190         /* If the socket is dead, don't accept the connection. */
2191         if (!sk->dead) 
2192         {
2193                 sk->data_ready(sk,0);
2194         }
2195         else 
2196         {
2197                 if(sk->debug)
2198                         printk("Reset on %p: Connect on dead socket.\n",sk);
2199                 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2200                 tcp_statistics.TcpAttemptFails++;
2201                 kfree_skb(skb, FREE_READ);
2202                 return;
2203         }
2204 
2205         /*
2206          * Make sure we can accept more.  This will prevent a
2207          * flurry of syns from eating up all our memory.
2208          */
2209 
2210         if (sk->ack_backlog >= sk->max_ack_backlog) 
2211         {
2212                 tcp_statistics.TcpAttemptFails++;
2213                 kfree_skb(skb, FREE_READ);
2214                 return;
2215         }
2216 
2217         /*
2218          * We need to build a new sock struct.
2219          * It is sort of bad to have a socket without an inode attached
2220          * to it, but the wake_up's will just wake up the listening socket,
2221          * and if the listening socket is destroyed before this is taken
2222          * off of the queue, this will take care of it.
2223          */
2224 
2225         newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2226         if (newsk == NULL) 
2227         {
2228                 /* just ignore the syn.  It will get retransmitted. */
2229                 tcp_statistics.TcpAttemptFails++;
2230                 kfree_skb(skb, FREE_READ);
2231                 return;
2232         }
2233 
2234         memcpy(newsk, sk, sizeof(*newsk));
2235         skb_queue_head_init(&newsk->write_queue);
2236         skb_queue_head_init(&newsk->receive_queue);
2237         newsk->send_head = NULL;
2238         newsk->send_tail = NULL;
2239         skb_queue_head_init(&newsk->back_log);
2240         newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
2241         newsk->rto = TCP_TIMEOUT_INIT;
2242         newsk->mdev = 0;
2243         newsk->max_window = 0;
2244         newsk->cong_window = 1;
2245         newsk->cong_count = 0;
2246         newsk->ssthresh = 0;
2247         newsk->backoff = 0;
2248         newsk->blog = 0;
2249         newsk->intr = 0;
2250         newsk->proc = 0;
2251         newsk->done = 0;
2252         newsk->partial = NULL;
2253         newsk->pair = NULL;
2254         newsk->wmem_alloc = 0;
2255         newsk->rmem_alloc = 0;
2256         newsk->localroute = sk->localroute;
2257 
2258         newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
2259 
2260         newsk->err = 0;
2261         newsk->shutdown = 0;
2262         newsk->ack_backlog = 0;
2263         newsk->acked_seq = skb->h.th->seq+1;
2264         newsk->copied_seq = skb->h.th->seq+1;
2265         newsk->fin_seq = skb->h.th->seq;
2266         newsk->state = TCP_SYN_RECV;
2267         newsk->timeout = 0;
2268         newsk->write_seq = seq; 
2269         newsk->window_seq = newsk->write_seq;
2270         newsk->rcv_ack_seq = newsk->write_seq;
2271         newsk->urg_data = 0;
2272         newsk->retransmits = 0;
2273         newsk->linger=0;
2274         newsk->destroy = 0;
2275         init_timer(&newsk->timer);
2276         newsk->timer.data = (unsigned long)newsk;
2277         newsk->timer.function = &net_timer;
2278         newsk->dummy_th.source = skb->h.th->dest;
2279         newsk->dummy_th.dest = skb->h.th->source;
2280         
2281         /*
2282          *      Swap these two, they are from our point of view. 
2283          */
2284          
2285         newsk->daddr = saddr;
2286         newsk->saddr = daddr;
2287 
2288         put_sock(newsk->num,newsk);
2289         newsk->dummy_th.res1 = 0;
2290         newsk->dummy_th.doff = 6;
2291         newsk->dummy_th.fin = 0;
2292         newsk->dummy_th.syn = 0;
2293         newsk->dummy_th.rst = 0;        
2294         newsk->dummy_th.psh = 0;
2295         newsk->dummy_th.ack = 0;
2296         newsk->dummy_th.urg = 0;
2297         newsk->dummy_th.res2 = 0;
2298         newsk->acked_seq = skb->h.th->seq + 1;
2299         newsk->copied_seq = skb->h.th->seq + 1;
2300         newsk->socket = NULL;
2301 
2302         /*
2303          *      Grab the ttl and tos values and use them 
2304          */
2305 
2306         newsk->ip_ttl=sk->ip_ttl;
2307         newsk->ip_tos=skb->ip_hdr->tos;
2308 
2309         /*
2310          *      Use 512 or whatever user asked for 
2311          */
2312 
2313         /*
2314          *      Note use of sk->user_mss, since user has no direct access to newsk 
2315          */
2316 
2317         rt=ip_rt_route(saddr, NULL,NULL);
2318         
2319         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
2320                 newsk->window_clamp = rt->rt_window;
2321         else
2322                 newsk->window_clamp = 0;
2323                 
2324         if (sk->user_mss)
2325                 newsk->mtu = sk->user_mss;
2326         else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
2327                 newsk->mtu = rt->rt_mss - HEADER_SIZE;
2328         else 
2329         {
2330 #ifdef CONFIG_INET_SNARL        /* Sub Nets Are Local */
2331                 if ((saddr ^ daddr) & default_mask(saddr))
2332 #else
2333                 if ((saddr ^ daddr) & dev->pa_mask)
2334 #endif
2335                         newsk->mtu = 576 - HEADER_SIZE;
2336                 else
2337                         newsk->mtu = MAX_WINDOW;
2338         }
2339 
2340         /*
2341          *      But not bigger than device MTU 
2342          */
2343 
2344         newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);
2345 
2346         /*
2347          *      This will be min()'d with the MSS that arrived in the packet 
2348          */
2349 
2350         tcp_options(newsk,skb->h.th);
2351 
2352         buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
2353         if (buff == NULL) 
2354         {
2355                 sk->err = -ENOMEM;
2356                 newsk->dead = 1;
2357                 release_sock(newsk);
2358                 kfree_skb(skb, FREE_READ);
2359                 tcp_statistics.TcpAttemptFails++;
2360                 return;
2361         }
2362   
2363         buff->len = sizeof(struct tcphdr)+4;
2364         buff->sk = newsk;
2365         buff->localroute = newsk->localroute;
2366 
2367         t1 =(struct tcphdr *) buff->data;
2368 
2369         /*
2370          *      Put in the IP header and routing stuff. 
2371          */
2372 
2373         tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
2374                                IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
2375 
2376         /*
2377          *      Something went wrong. 
2378          */
2379 
2380         if (tmp < 0) 
2381         {
2382                 sk->err = tmp;
2383                 buff->free=1;
2384                 kfree_skb(buff,FREE_WRITE);
2385                 newsk->dead = 1;
2386                 release_sock(newsk);
2387                 skb->sk = sk;
2388                 kfree_skb(skb, FREE_READ);
2389                 tcp_statistics.TcpAttemptFails++;
2390                 return;
2391         }
2392 
2393         buff->len += tmp;
2394         t1 =(struct tcphdr *)((char *)t1 +tmp);
2395   
2396         memcpy(t1, skb->h.th, sizeof(*t1));
2397         buff->h.seq = newsk->write_seq;
2398         /*
2399          *      Swap the send and the receive. 
2400          */
2401         t1->dest = skb->h.th->source;
2402         t1->source = newsk->dummy_th.source;
2403         t1->seq = ntohl(newsk->write_seq++);
2404         t1->ack = 1;
2405         newsk->window = tcp_select_window(newsk);
2406         newsk->sent_seq = newsk->write_seq;
2407         t1->window = ntohs(newsk->window);
2408         t1->res1 = 0;
2409         t1->res2 = 0;
2410         t1->rst = 0;
2411         t1->urg = 0;
2412         t1->psh = 0;
2413         t1->syn = 1;
2414         t1->ack_seq = ntohl(skb->h.th->seq+1);
2415         t1->doff = sizeof(*t1)/4+1;
2416         ptr =(unsigned char *)(t1+1);
2417         ptr[0] = 2;
2418         ptr[1] = 4;
2419         ptr[2] = ((newsk->mtu) >> 8) & 0xff;
2420         ptr[3] =(newsk->mtu) & 0xff;
2421 
2422         tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
2423         newsk->prot->queue_xmit(newsk, ndev, buff, 0);
2424 
2425         reset_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
2426         skb->sk = newsk;
2427 
2428         /*
2429          *      Charge the sock_buff to newsk. 
2430          */
2431          
2432         sk->rmem_alloc -= skb->mem_len;
2433         newsk->rmem_alloc += skb->mem_len;
2434         
2435         skb_queue_tail(&sk->receive_queue,skb);
2436         sk->ack_backlog++;
2437         release_sock(newsk);
2438         tcp_statistics.TcpOutSegs++;
2439 }
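
/*
 *      Editor's note: an illustrative sketch (not part of the original
 *      file, never compiled) of two steps in tcp_conn_request() above:
 *      picking an MTU/MSS for the new socket and encoding the value as
 *      the 4-byte MSS option in the SYN-ACK.  HEADER_SIZE, MAX_WINDOW and
 *      min() are this file's own; the helper names and the dest_is_local
 *      parameter are hypothetical.
 */
#if 0   /* illustrative sketch only */
static int choose_new_sock_mtu(int user_mss, int route_mss,
                               int dest_is_local, int dev_mtu)
{
        int mtu;

        if (user_mss)                           /* explicit user setting wins */
                mtu = user_mss;
        else if (route_mss)                     /* per-route MSS, if any */
                mtu = route_mss - HEADER_SIZE;
        else if (!dest_is_local)                /* conservative off-net default */
                mtu = 576 - HEADER_SIZE;
        else
                mtu = MAX_WINDOW;

        return min(mtu, dev_mtu - HEADER_SIZE); /* never above the device MTU */
}

static void encode_mss_option(unsigned char *ptr, int mss)
{
        ptr[0] = 2;                     /* kind: MSS             */
        ptr[1] = 4;                     /* option length         */
        ptr[2] = (mss >> 8) & 0xff;     /* value, network order  */
        ptr[3] = mss & 0xff;
}
#endif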
2440 
2441 
2442 static void tcp_close(struct sock *sk, int timeout)
2443 {
2444         struct sk_buff *buff;
2445         struct tcphdr *t1, *th;
2446         struct proto *prot;
2447         struct device *dev=NULL;
2448         int tmp;
2449 
2450         /*
2451          * We need to grab some memory, and put together a FIN, 
2452          * and then put it into the queue to be sent.
2453          */
2454         sk->inuse = 1;
2455         sk->keepopen = 1;
2456         sk->shutdown = SHUTDOWN_MASK;
2457 
2458         if (!sk->dead) 
2459                 sk->state_change(sk);
2460 
2461         if (timeout == 0) 
2462         {
2463                 /*
2464                  *  We need to flush the recv. buffs.  We do this only on the
2465                  *  descriptor close, not protocol-sourced closes, because the
2466                  *  reader process may not have drained the data yet!
2467                  */
2468 
2469                 if (skb_peek(&sk->receive_queue) != NULL) 
2470                 {
2471                         struct sk_buff *skb;
2472                         if(sk->debug)
2473                                 printk("Clean rcv queue\n");
2474                         while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
2475                                 kfree_skb(skb, FREE_READ);
2476                         if(sk->debug)
2477                                 printk("Cleaned.\n");
2478                 }
2479         }
2480 
2481         /*
2482          *      Get rid of any half-completed packets. 
2483          */
2484          
2485         if (sk->partial) 
2486         {
2487                 tcp_send_partial(sk);
2488         }
2489 
2490         switch(sk->state) 
2491         {
2492                 case TCP_FIN_WAIT1:
2493                 case TCP_FIN_WAIT2:
2494                 case TCP_CLOSING:
2495                         /*
2496                          * These states occur when we have already closed out
2497                          * our end.  If there is no timeout, we do not do
2498                          * anything.  We may still be in the middle of sending
2499                          * the remainder of our buffer, for example...
2500                          * resetting the timer would be inappropriate.
2501                          *
2502                          * XXX if retransmit count reaches limit, is tcp_close()
2503                          * called with timeout == 1 ? if not, we need to fix that.
2504                          */
2505                         if (!timeout) {
2506                                 int timer_active;
2507 
2508                                 timer_active = del_timer(&sk->timer);
2509                                 if (timer_active)
2510                                         add_timer(&sk->timer);
2511                                 else
2512                                         reset_timer(sk, TIME_CLOSE, 4 * sk->rto);
2513                         }
2514                         if (timeout) 
2515                                 tcp_time_wait(sk);
2516                         release_sock(sk);
2517                         return; /* break causes a double release - messy */
2518                 case TCP_TIME_WAIT:
2519                 case TCP_LAST_ACK:
2520                         /*
2521                          * A timeout from these states terminates the TCB.
2522                          */
2523                         if (timeout) 
2524                         {
2525                                 tcp_set_state(sk,TCP_CLOSE);
2526                         }
2527                         release_sock(sk);
2528                         return;
2529                 case TCP_LISTEN:
2530                         /* we need to drop any sockets which have been connected,
2531                            but have not yet been accepted. */
2532                         tcp_set_state(sk,TCP_CLOSE);
2533                         tcp_close_pending(sk, timeout);
2534                         release_sock(sk);
2535                         return;
2536                 case TCP_CLOSE:
2537                         release_sock(sk);
2538                         return;
2539                 case TCP_CLOSE_WAIT:
2540                 case TCP_ESTABLISHED:
2541                 case TCP_SYN_SENT:
2542                 case TCP_SYN_RECV:
2543                         prot =(struct proto *)sk->prot;
2544                         th =(struct tcphdr *)&sk->dummy_th;
2545                         buff = prot->wmalloc(sk, MAX_FIN_SIZE, 1, GFP_ATOMIC);
2546                         if (buff == NULL) 
2547                         {
2548                                 /* This will force it to try again later. */
2549                                 /* Or it would have if someone released the socket
2550                                    first. Anyway it might work now */
2551                                 release_sock(sk);
2552                                 if (sk->state != TCP_CLOSE_WAIT)
2553                                         tcp_set_state(sk,TCP_ESTABLISHED);
2554                                 reset_timer(sk, TIME_CLOSE, 100);
2555                                 return;
2556                         }
2557                         buff->sk = sk;
2558                         buff->free = 1;
2559                         buff->len = sizeof(*t1);
2560                         buff->localroute = sk->localroute;
2561                         t1 =(struct tcphdr *) buff->data;
2562         
2563                         /*
2564                          *      Put in the IP header and routing stuff. 
2565                          */
2566                         tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2567                                          IPPROTO_TCP, sk->opt,
2568                                          sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
2569                         if (tmp < 0) 
2570                         {
2571                                 sk->write_seq++;        /* Very important 8) */
2572                                 kfree_skb(buff,FREE_WRITE);
2573 
2574                                 /*
2575                                  * Enter FIN_WAIT1 to await completion of
2576                                  * written out data and ACK to our FIN.
2577                                  */
2578 
2579                                 if(sk->state==TCP_ESTABLISHED)
2580                                         tcp_set_state(sk,TCP_FIN_WAIT1);
2581                                 else
2582                                         tcp_set_state(sk,TCP_FIN_WAIT2);
2583                                 reset_timer(sk, TIME_CLOSE,4*sk->rto);
2584                                 if(timeout)
2585                                         tcp_time_wait(sk);
2586 
2587                                 release_sock(sk);
2588                                 return;
2589                         }
2590 
2591                         t1 =(struct tcphdr *)((char *)t1 +tmp);
2592                         buff->len += tmp;
2593                         buff->dev = dev;
2594                         memcpy(t1, th, sizeof(*t1));
2595                         t1->seq = ntohl(sk->write_seq);
2596                         sk->write_seq++;
2597                         buff->h.seq = sk->write_seq;
2598                         t1->ack = 1;
2599         
2600                         /* 
2601                          *      Ack everything immediately from now on. 
2602                          */
2603 
2604                         sk->delay_acks = 0;
2605                         t1->ack_seq = ntohl(sk->acked_seq);
2606                         t1->window = ntohs(sk->window=tcp_select_window(sk));
2607                         t1->fin = 1;
2608                         t1->rst = 0;
2609                         t1->doff = sizeof(*t1)/4;
2610                         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2611 
2612                         tcp_statistics.TcpOutSegs++;
2613         
2614                         if (skb_peek(&sk->write_queue) == NULL) 
2615                         {
2616                                 sk->sent_seq = sk->write_seq;
2617                                 prot->queue_xmit(sk, dev, buff, 0);
2618                         } 
2619                         else 
2620                         {
2621                                 reset_timer(sk, TIME_WRITE, sk->rto);
2622                                 if (buff->next != NULL) 
2623                                 {
2624                                         printk("tcp_close: next != NULL\n");
2625                                         skb_unlink(buff);
2626                                 }
2627                                 skb_queue_tail(&sk->write_queue, buff);
2628                         }
2629 
2630                         /*
2631                          * If established (normal close), enter FIN_WAIT1.
2632                          * If in CLOSE_WAIT, enter LAST_ACK
2633                          * If in CLOSING, remain in CLOSING
2634                          * otherwise enter FIN_WAIT2
2635                          */
2636 
2637                         if (sk->state == TCP_ESTABLISHED)
2638                                 tcp_set_state(sk,TCP_FIN_WAIT1);
2639                         else if (sk->state == TCP_CLOSE_WAIT)
2640                                 tcp_set_state(sk,TCP_LAST_ACK);
2641                         else if (sk->state != TCP_CLOSING)
2642                                 tcp_set_state(sk,TCP_FIN_WAIT2);
2643         }
2644         release_sock(sk);
2645 }
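
/*
 *      Editor's note: an illustrative sketch (not part of the original
 *      file, never compiled) summarising what tcp_close() above does for
 *      each state; it is only a reading aid for the switch statement and
 *      the enum and helper names are hypothetical.
 */
#if 0   /* illustrative sketch only */
enum close_action {
        CLOSE_WAIT_FOR_FIN_ACK,         /* FIN_WAIT1/2, CLOSING: keep waiting */
        CLOSE_KILL_ON_TIMEOUT,          /* TIME_WAIT, LAST_ACK: timeout ends TCB */
        CLOSE_DROP_PENDING,             /* LISTEN: drop unaccepted connections */
        CLOSE_NOTHING,                  /* CLOSE: already gone */
        CLOSE_SEND_FIN                  /* ESTABLISHED etc.: queue or send a FIN */
};

static enum close_action close_action_for(int state)
{
        switch (state) {
                case TCP_FIN_WAIT1:
                case TCP_FIN_WAIT2:
                case TCP_CLOSING:
                        return CLOSE_WAIT_FOR_FIN_ACK;
                case TCP_TIME_WAIT:
                case TCP_LAST_ACK:
                        return CLOSE_KILL_ON_TIMEOUT;
                case TCP_LISTEN:
                        return CLOSE_DROP_PENDING;
                case TCP_CLOSE:
                        return CLOSE_NOTHING;
                default:
                        return CLOSE_SEND_FIN;
        }
}
#endif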
2646 
2647 
2648 /*
2649  * This routine takes stuff off of the write queue,
2650  * and puts it in the xmit queue.
2651  */
2652 static void
2653 tcp_write_xmit(struct sock *sk)
2654 {
2655         struct sk_buff *skb;
2656 
2657         /*
2658          *      The bytes will have to remain here. In time, closedown will
2659          *      empty the write queue and all will be happy.
2660          */
2661 
2662         if(sk->zapped)
2663                 return;
2664 
2665         while((skb = skb_peek(&sk->write_queue)) != NULL &&
2666                 before(skb->h.seq, sk->window_seq + 1) &&
2667                 (sk->retransmits == 0 ||
2668                  sk->timeout != TIME_WRITE ||
2669                  before(skb->h.seq, sk->rcv_ack_seq + 1))
2670                 && sk->packets_out < sk->cong_window) 
2671         {
2672                 IS_SKB(skb);
2673                 skb_unlink(skb);
2674                 /* See if we really need to send the packet. */
2675                 if (before(skb->h.seq, sk->rcv_ack_seq +1)) 
2676                 {
2677                         sk->retransmits = 0;
2678                         kfree_skb(skb, FREE_WRITE);
2679                         if (!sk->dead) 
2680                                 sk->write_space(sk);
2681                 } 
2682                 else
2683                 {
2684                         struct tcphdr *th;
2685                         struct iphdr *iph;
2686                         int size;
2687 /*
2688  * put in the ack seq and window at this point rather than earlier,
2689  * in order to keep them monotonic.  We really want to avoid taking
2690  * back window allocations.  That's legal, but RFC1122 says it's frowned on.
2691  * Ack and window will in general have changed since this packet was put
2692  * on the write queue.
2693  */
2694                         iph = (struct iphdr *)(skb->data +
2695                                                skb->dev->hard_header_len);
2696                         th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
2697                         size = skb->len - (((unsigned char *) th) - skb->data);
2698                         
2699                         th->ack_seq = ntohl(sk->acked_seq);
2700                         th->window = ntohs(tcp_select_window(sk));
2701 
2702                         tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
2703 
2704                         sk->sent_seq = skb->h.seq;
2705                         sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
2706                 }
2707         }
2708 }
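
/*
 *      Editor's note: an illustrative sketch (not part of the original
 *      file, never compiled) of the gate applied by the while loop in
 *      tcp_write_xmit() above.  before() is this file's sequence-number
 *      comparison; the helper name is hypothetical.
 */
#if 0   /* illustrative sketch only */
static int may_transmit(struct sock *sk, struct sk_buff *skb)
{
        /*
         * A queued segment may be transmitted only if it fits inside the
         * offered window, we are not waiting out a retransmission timeout
         * for data beyond the last ACK, and the congestion window still
         * has room for another packet in flight.
         */
        return before(skb->h.seq, sk->window_seq + 1) &&
               (sk->retransmits == 0 ||
                sk->timeout != TIME_WRITE ||
                before(skb->h.seq, sk->rcv_ack_seq + 1)) &&
               sk->packets_out < sk->cong_window;
}
#endif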
2709 
2710 
2711 /*
2712  *      This routine deals with incoming acks, but not outgoing ones.
2713  */
2714 
2715 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
2716 {
2717         unsigned long ack;
2718         int flag = 0;
2719 
2720         /* 
2721          * 1 - there was data in packet as well as ack or new data is sent or 
2722          *     in shutdown state
2723          * 2 - data from retransmit queue was acked and removed
2724          * 4 - window shrunk or data from retransmit queue was acked and removed
2725          */
2726 
2727         if(sk->zapped)
2728                 return(1);      /* Dead, can't ack any more so why bother */
2729 
2730         ack = ntohl(th->ack_seq);
2731         if (ntohs(th->window) > sk->max_window) 
2732         {
2733                 sk->max_window = ntohs(th->window);
2734 #ifdef CONFIG_INET_PCTCP
2735                 sk->mss = min(sk->max_window>>1, sk->mtu);
2736 #else
2737                 sk->mss = min(sk->max_window, sk->mtu);
2738 #endif  
2739         }
2740 
2741         if (sk->retransmits && sk->timeout == TIME_KEEPOPEN)
2742                 sk->retransmits = 0;
2743 
2744         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
2745         {
2746                 if(sk->debug)
2747                         printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);
2748                         
2749                 /*
2750                  *      Keepalive processing.
2751                  */
2752                  
2753                 if (after(ack, sk->sent_seq)) 
2754                 {
2755                         return(0);
2756                 }
2757                 if (sk->keepopen) 
2758                 {
2759                         if(sk->timeout==TIME_KEEPOPEN)
2760                                 reset_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
2761                 }
2762                 return(1);
2763         }
2764 
2765         if (len != th->doff*4) 
2766                 flag |= 1;
2767 
2768         /* See if our window has been shrunk. */
2769 
2770         if (after(sk->window_seq, ack+ntohs(th->window))) 
2771         {
2772                 /*
2773                  * We may need to move packets from the send queue
2774                  * to the write queue, if the window has been shrunk on us.
2775                  * The RFC says you are not allowed to shrink your window
2776                  * like this, but if the other end does, you must be able
2777                  * to deal with it.
2778                  */
2779                 struct sk_buff *skb;
2780                 struct sk_buff *skb2;
2781                 struct sk_buff *wskb = NULL;
2782         
2783                 skb2 = sk->send_head;
2784                 sk->send_head = NULL;
2785                 sk->send_tail = NULL;
2786         
2787                 /*
2788                  *      This is an artifact of a flawed concept. We want one
2789                  *      queue and a smarter send routine when we send all.
2790                  */
2791         
2792                 flag |= 4;
2793         
2794                 sk->window_seq = ack + ntohs(th->window);
2795                 cli();
2796                 while (skb2 != NULL) 
2797                 {
2798                         skb = skb2;
2799                         skb2 = skb->link3;
2800                         skb->link3 = NULL;
2801                         if (after(skb->h.seq, sk->window_seq)) 
2802                         {
2803                                 if (sk->packets_out > 0) 
2804                                         sk->packets_out--;
2805                                 /* We may need to remove this from the dev send list. */
2806                                 if (skb->next != NULL) 
2807                                 {
2808                                         skb_unlink(skb);                                
2809                                 }
2810                                 /* Now add it to the write_queue. */
2811                                 if (wskb == NULL)
2812                                         skb_queue_head(&sk->write_queue,skb);
2813                                 else
2814                                         skb_append(wskb,skb);
2815                                 wskb = skb;
2816                         } 
2817                         else 
2818                         {
2819                                 if (sk->send_head == NULL) 
2820                                 {
2821                                         sk->send_head = skb;
2822                                         sk->send_tail = skb;
2823                                 }
2824                                 else
2825                                 {
2826                                         sk->send_tail->link3 = skb;
2827                                         sk->send_tail = skb;
2828                                 }
2829                                 skb->link3 = NULL;
2830                         }
2831                 }
2832                 sti();
2833         }
2834 
2835         /*
2836          *      Pipe has emptied
2837          */
2838          
2839         if (sk->send_tail == NULL || sk->send_head == NULL) 
2840         {
2841                 sk->send_head = NULL;
2842                 sk->send_tail = NULL;
2843                 sk->packets_out= 0;
2844         }
2845 
2846         sk->window_seq = ack + ntohs(th->window);
2847 
2848         /* We don't want too many packets out there. */
2849         if (sk->timeout == TIME_WRITE && 
2850                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
2851         {
2852                 /* 
2853                  * This is Jacobson's slow start and congestion avoidance. 
2854                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
2855                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
2856                  * counter and increment it once every cwnd times.  It's possible
2857                  * that this should be done only if sk->retransmits == 0.  I'm
2858                  * interpreting "new data is acked" as including data that has
2859                  * been retransmitted but is just now being acked.
2860                  */
2861                 if (sk->cong_window < sk->ssthresh)  
2862                         /* 
2863                          *      In "safe" area, increase
2864                          */
2865                         sk->cong_window++;
2866                 else 
2867                 {
2868                         /*
2869                          *      In dangerous area, increase slowly.  In theory this is
2870                          *      sk->cong_window += 1 / sk->cong_window
2871                          */
2872                         if (sk->cong_count >= sk->cong_window) 
2873                         {
2874                                 sk->cong_window++;
2875                                 sk->cong_count = 0;
2876                         }
2877                         else 
2878                                 sk->cong_count++;
2879                 }
2880         }
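
        /*
         *      Worked example of the update above (assumed figures, with
         *      cong_window counted in whole mss segments): starting from
         *      cong_window = 1 and ssthresh = 8, one ack per outstanding
         *      segment roughly doubles the window each round trip
         *      (1, 2, 4, 8) while below ssthresh.  Once at 8, cong_count
         *      must reach 8 before the next increment, so the window then
         *      opens by about one segment per round trip.
         */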
2881 
2882         sk->rcv_ack_seq = ack;
2883 
2884         /*
2885          *      If this ack opens up a zero window, clear backoff.  It was
2886          *      being used to time the probes, and is probably far higher than
2887          *      it needs to be for normal retransmission.
2888          */
2889 
2890         if (sk->timeout == TIME_PROBE0) 
2891         {
2892                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
2893                     ! before (sk->window_seq, sk->write_queue.next->h.seq)) 
2894                 {
2895                         sk->retransmits = 0;
2896                         sk->backoff = 0;
2897                         
2898                         /*
2899                          *      Recompute rto from rtt.  this eliminates any backoff.
2900                          */
2901 
2902                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
2903                         if (sk->rto > 120*HZ)
2904                                 sk->rto = 120*HZ;
2905                         if (sk->rto < 20)       /* Was 1*HZ, then 1 - turns out we must allow about
2906                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
2907                                                    .2 of a second is going to need huge windows (SIGH) */
2908                                 sk->rto = 20;
2909                 }
2910         }
2911 
2912         /* 
2913          *      See if we can take anything off of the retransmit queue.
2914          */
2915    
2916         while(sk->send_head != NULL) 
2917         {
2918                 /* Check for a bug. */
2919                 if (sk->send_head->link3 &&
2920                     after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) 
2921                         printk("INET: tcp.c: *** bug send_list out of order.\n");
2922                 if (before(sk->send_head->h.seq, ack+1)) 
2923                 {
2924                         struct sk_buff *oskb;   
2925                         if (sk->retransmits) 
2926                         {       
2927                                 /*
2928                                  *      We were retransmitting.  Don't count this in the RTT estimate.
2929                                  */
2930                                 flag |= 2;
2931 
2932                                 /*
2933                                  * even though we've gotten an ack, we're still
2934                                  * retransmitting as long as we're sending from
2935                                  * the retransmit queue.  Keeping retransmits non-zero
2936                                  * prevents us from getting new data interspersed with
2937                                  * retransmissions.
2938                                  */
2939 
2940                                 if (sk->send_head->link3)
2941                                         sk->retransmits = 1;
2942                                 else
2943                                         sk->retransmits = 0;
2944                         }
2945                         /*
2946                          * Note that we only reset backoff and rto in the
2947                          * rtt recomputation code.  And that doesn't happen
2948                          * if there were retransmissions in effect.  So the
2949                          * first new packet after the retransmissions is
2950                          * sent with the backoff still in effect.  Not until
2951                          * we get an ack from a non-retransmitted packet do
2952                          * we reset the backoff and rto.  This allows us to deal
2953                          * with a situation where the network delay has increased
2954                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
2955                          */
2956 
2957                         /*
2958                          *      We have one less packet out there. 
2959                          */
2960                          
2961                         if (sk->packets_out > 0) 
2962                                 sk->packets_out --;
2963                         /* 
2964                          *      Wake up the process, it can probably write more. 
2965                          */
2966                         if (!sk->dead) 
2967                                 sk->write_space(sk);
2968                         oskb = sk->send_head;
2969 
2970                         if (!(flag&2)) 
2971                         {
2972                                 long m;
2973         
2974                                 /*
2975                                  *      The following amusing code comes from Jacobson's
2976                                  *      article in SIGCOMM '88.  Note that rtt and mdev
2977                                  *      are scaled versions of rtt and mean deviation.
2978                                  *      This is designed to be as fast as possible.
2979                                  *      m stands for "measurement".
2980                                  */
2981         
2982                                 m = jiffies - oskb->when;  /* RTT */
2983                                 if(m<=0)
2984                                         m=1;            /* IS THIS RIGHT FOR <0 ??? */
2985                                 m -= (sk->rtt >> 3);    /* m is now error in rtt est */
2986                                 sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
2987                                 if (m < 0)
2988                                         m = -m;         /* m is now abs(error) */
2989                                 m -= (sk->mdev >> 2);   /* similar update on mdev */
2990                                 sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
2991         
2992                                 /*
2993                                  *      Now update timeout.  Note that this removes any backoff.
2994                                  */
2995                          
2996                                 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
2997                                 if (sk->rto > 120*HZ)
2998                                         sk->rto = 120*HZ;
2999                                 if (sk->rto < 20)       /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3000                                         sk->rto = 20;
3001                                 sk->backoff = 0;
3002                         }
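
                        /*
                         *      Worked example of the arithmetic above, with assumed
                         *      figures: sk->rtt holds eight times the smoothed rtt and
                         *      sk->mdev four times the mean deviation.  With a new
                         *      measurement m = 30 ticks, sk->rtt = 160 and sk->mdev = 16,
                         *      the error is 30 - 20 = 10, giving sk->rtt = 170 and
                         *      sk->mdev = 22, and rto = ((170 >> 2) + 22) >> 1 = 32 ticks,
                         *      i.e. roughly srtt + 2 * mdev.
                         */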
3003                         flag |= (2|4);
3004                         cli();
3005                         oskb = sk->send_head;
3006                         IS_SKB(oskb);
3007                         sk->send_head = oskb->link3;
3008                         if (sk->send_head == NULL) 
3009                         {
3010                                 sk->send_tail = NULL;
3011                         }
3012 
3013                 /*
3014                  *      We may need to remove this from the dev send list. 
3015                  */
3016 
3017                         if (oskb->next)
3018                                 skb_unlink(oskb);
3019                         sti();
3020                         kfree_skb(oskb, FREE_WRITE); /* write. */
3021                         if (!sk->dead) 
3022                                 sk->write_space(sk);
3023                 }
3024                 else
3025                 {
3026                         break;
3027                 }
3028         }
3029 
3030         /*
3031          * XXX someone ought to look at this too.. at the moment, if skb_peek()
3032  * returns non-NULL, we completely ignore the timer stuff in the else
3033  * clause.  We ought to organize the code so that the else clause can
3034          * (should) be executed regardless, possibly moving the PROBE timer
3035          * reset over.  The skb_peek() thing should only move stuff to the
3036          * write queue, NOT also manage the timer functions.
3037          */
3038 
3039         /*
3040          * Maybe we can take some stuff off of the write queue,
3041          * and put it onto the xmit queue.
3042          */
3043         if (skb_peek(&sk->write_queue) != NULL) 
3044         {
3045                 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3046                         (sk->retransmits == 0 || 
3047                          sk->timeout != TIME_WRITE ||
3048                          before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3049                         && sk->packets_out < sk->cong_window) 
3050                 {
3051                         flag |= 1;
3052                         tcp_write_xmit(sk);
3053                 }
3054                 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3055                         sk->send_head == NULL &&
3056                         sk->ack_backlog == 0 &&
3057                         sk->state != TCP_TIME_WAIT) 
3058                 {
3059                         reset_timer(sk, TIME_PROBE0, sk->rto);
3060                 }               
3061         }
3062         else
3063         {
3064                 /*
3065                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3066                  * from TCP_CLOSE we don't do anything
3067                  *
3068                  * from anything else, if there is write data (or fin) pending,
3069                  * we use a TIME_WRITE timeout, else if keepalive we reset to
3070                  * a KEEPALIVE timeout, else we delete the timer.
3071                  *
3072                  * We do not set flag for nominal write data, otherwise we may
3073                  * force a state where we start to write itsy bitsy tidbits
3074                  * of data.
3075                  */
3076 
3077                 switch(sk->state) {
3078                 case TCP_TIME_WAIT:
3079                         /*
3080                          * keep us in TIME_WAIT until we stop getting packets,
3081                          * reset the timeout.
3082                          */
3083                         reset_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3084                         break;
3085                 case TCP_CLOSE:
3086                         /*
3087                          * don't touch the timer.
3088                          */
3089                         break;
3090                 default:
3091                         /*
3092                          * must check send_head, write_queue, and ack_backlog
3093                          * to determine which timeout to use.
3094                          */
3095                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3096                                 reset_timer(sk, TIME_WRITE, sk->rto);
3097                         } else if (sk->keepopen) {
3098                                 reset_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3099                         } else {
3100                                 delete_timer(sk);
3101                         }
3102                         break;
3103                 }
3104         }
3105 
3106         if (sk->packets_out == 0 && sk->partial != NULL &&
3107                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
3108         {
3109                 flag |= 1;
3110                 tcp_send_partial(sk);
3111         }
3112 
3113         /*
3114          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
3115          * we are now waiting for an acknowledge to our FIN.  The other end is
3116          * already in TIME_WAIT.
3117          *
3118          * Move to TCP_CLOSE on success.
3119          */
3120 
3121         if (sk->state == TCP_LAST_ACK) 
3122         {
3123                 if (!sk->dead)
3124                         sk->state_change(sk);
3125                 if(sk->debug)
3126                         printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
3127                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3128                 if (sk->rcv_ack_seq == sk->write_seq && sk->acked_seq == sk->fin_seq) 
3129                 {
3130                         flag |= 1;
3131                         tcp_set_state(sk,TCP_CLOSE);
3132                         sk->shutdown = SHUTDOWN_MASK;
3133                 }
3134         }
3135 
3136         /*
3137          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
3138          *
3139          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
3140          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3141          */
3142 
3143         if (sk->state == TCP_FIN_WAIT1) 
3144         {
3145 
3146                 if (!sk->dead) 
3147                         sk->state_change(sk);
3148                 if (sk->rcv_ack_seq == sk->write_seq) 
3149                 {
3150                         flag |= 1;
3151                         sk->shutdown |= SEND_SHUTDOWN;
3152                         tcp_set_state(sk, TCP_FIN_WAIT2);
3153                 }
3154         }
3155 
3156         /*
3157          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
3158          *
3159          *      Move to TIME_WAIT
3160          */
3161 
3162         if (sk->state == TCP_CLOSING) 
3163         {
3164 
3165                 if (!sk->dead) 
3166                         sk->state_change(sk);
3167                 if (sk->rcv_ack_seq == sk->write_seq) 
3168                 {
3169                         flag |= 1;
3170                         tcp_time_wait(sk);
3171                 }
3172         }
3173         
3174         /*
3175          *      Final ack of a three way shake 
3176          */
3177          
3178         if(sk->state==TCP_SYN_RECV)
3179         {
3180                 tcp_set_state(sk, TCP_ESTABLISHED);
3181                 tcp_options(sk,th);
3182                 sk->dummy_th.dest=th->source;
3183                 sk->copied_seq = sk->acked_seq;
3184                 if(!sk->dead)
3185                         sk->state_change(sk);
3186                 if(sk->max_window==0)
3187                 {
3188                         sk->max_window=32;
3189                         sk->mss=min(sk->max_window,sk->mtu);
3190                 }
3191         }
3192         
3193         /*
3194          * I make no guarantees about the first clause in the following
3195          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
3196          * what conditions "!flag" would be true.  However I think the rest
3197          * of the conditions would prevent that from causing any
3198          * unnecessary retransmission. 
3199          *   Clearly if the first packet has expired it should be 
3200          * retransmitted.  The other alternative, "flag&2 && retransmits", is
3201          * harder to explain:  You have to look carefully at how and when the
3202          * timer is set and with what timeout.  The most recent transmission always
3203          * sets the timer.  So in general if the most recent thing has timed
3204          * out, everything before it has as well.  So we want to go ahead and
3205          * retransmit some more.  If we didn't explicitly test for this
3206          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3207          * would not be true.  If you look at the pattern of timing, you can
3208          * show that rto is increased fast enough that the next packet would
3209          * almost never be retransmitted immediately.  Then you'd end up
3210          * waiting for a timeout to send each packet on the retransmission
3211          * queue.  With my implementation of the Karn sampling algorithm,
3212          * the timeout would double each time.  The net result is that it would
3213          * take a hideous amount of time to recover from a single dropped packet.
3214          * It's possible that there should also be a test for TIME_WRITE, but
3215          * I think as long as "send_head != NULL" and "retransmit" is on, we've
3216          * got to be in real retransmission mode.
3217          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
3218          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3219          * As long as no further losses occur, this seems reasonable.
3220          */
3221         
3222         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3223                (((flag&2) && sk->retransmits) ||
3224                (sk->send_head->when + sk->rto < jiffies))) 
3225         {
3226                 if(sk->send_head->when + sk->rto < jiffies)
3227                         tcp_retransmit(sk,0);   
3228                 else
3229                 {
3230                         tcp_do_retransmit(sk, 1);
3231                         reset_timer(sk, TIME_WRITE, sk->rto);
3232                 }
3233         }
3234 
3235         return(1);
3236 }
3237 
3238 
3239 /*
3240  *      Process the FIN bit. This now behaves as it is supposed to:
3241  *      the FIN takes effect when it is validly part of the sequence
3242  *      space, not earlier while there are still holes.
3243  *
3244  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3245  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
3246  *      TIME-WAIT)
3247  *
3248  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
3249  *      close and we go into CLOSING (and later onto TIME-WAIT)
3250  *
3251  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3252  *
3253  */
3254  
3255 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
3256 {
3257         sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3258 
3259         if (!sk->dead) 
3260         {
3261                 sk->state_change(sk);
3262         }
3263 
3264         switch(sk->state) 
3265         {
3266                 case TCP_SYN_RECV:
3267                 case TCP_SYN_SENT:
3268                 case TCP_ESTABLISHED:
3269                         /*
3270                          * move to CLOSE_WAIT, tcp_data() already handled
3271                          * sending the ack.
3272                          */     /* Check me --------------vvvvvvv */
3273                         reset_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3274                         tcp_set_state(sk,TCP_CLOSE_WAIT);
3275                         if (th->rst)
3276                                 sk->shutdown = SHUTDOWN_MASK;
3277                         break;
3278 
3279                 case TCP_CLOSE_WAIT:
3280                 case TCP_CLOSING:
3281                         /*
3282                          * received a retransmission of the FIN, do
3283                          * nothing.
3284                          */
3285                         break;
3286                 case TCP_TIME_WAIT:
3287                         /*
3288                          * received a retransmission of the FIN,
3289                          * restart the TIME_WAIT timer.
3290                          */
3291                         reset_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3292                         return(0);
3293                 case TCP_FIN_WAIT1:
3294                         /*
3295                          * This case occurs when a simultaneous close
3296                          * happens, we must ack the received FIN and
3297                          * enter the CLOSING state.
3298                          *
3299                          * This causes a WRITE timeout, which will either
3300                          * move on to TIME_WAIT when we timeout, or resend
3301                          * the FIN properly (maybe we get rid of that annoying
3302                          * FIN lost hang). The TIME_WRITE code is already correct
3303                          * for handling this timeout.
3304                          */
3305 
3306                         if(sk->timeout != TIME_WRITE)
3307                                 reset_timer(sk, TIME_WRITE, sk->rto);
3308                         tcp_set_state(sk,TCP_CLOSING);
3309                         break;
3310                 case TCP_FIN_WAIT2:
3311                         /*
3312                          * received a FIN -- send ACK and enter TIME_WAIT
3313                          */
3314                         reset_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3315                         sk->shutdown|=SHUTDOWN_MASK;
3316                         tcp_set_state(sk,TCP_TIME_WAIT);
3317                         break;
3318                 case TCP_CLOSE:
3319                         /*
3320                          * already in CLOSE
3321                          */
3322                         break;
3323                 default:
3324                         tcp_set_state(sk,TCP_LAST_ACK);
3325         
3326                         /* Start the timers. */
3327                         reset_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3328                         return(0);
3329         }
3330 
3331         return(0);
3332 }
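
/*
 *      A minimal illustrative sketch (not part of the original file): the
 *      state moves tcp_fin() makes on a valid FIN, collected into one
 *      table.  The function name is invented for illustration and the
 *      TCP_* constants are the ones used above.
 */
#if 0
static int example_state_after_fin(int state)
{
        switch (state) {
                case TCP_SYN_RECV:
                case TCP_SYN_SENT:
                case TCP_ESTABLISHED:
                        return TCP_CLOSE_WAIT;          /* remote end closed first */
                case TCP_FIN_WAIT1:
                        return TCP_CLOSING;             /* simultaneous close */
                case TCP_FIN_WAIT2:
                        return TCP_TIME_WAIT;           /* our FIN was already acked */
                case TCP_CLOSE_WAIT:
                case TCP_CLOSING:
                case TCP_TIME_WAIT:
                case TCP_CLOSE:
                        return state;                   /* retransmitted FIN, no change */
                default:
                        return TCP_LAST_ACK;            /* remaining states, as above */
        }
}
#endif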
3333 
3334 
3335 
3336 /*
3337  *      This routine handles the data.  If there is room in the buffer,
3338  *      it will already have been moved into it.  If there is no
3339  *      room, then we will just have to discard the packet.
3340  */
3341 
3342 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, 
3343          unsigned long saddr, unsigned short len)
3344 {
3345         struct sk_buff *skb1, *skb2;
3346         struct tcphdr *th;
3347         int dup_dumped=0;
3348         unsigned long new_seq;
3349         unsigned long shut_seq;
3350 
3351         th = skb->h.th;
3352         skb->len = len -(th->doff*4);
3353 
3354         /*
3355          *      The bytes in the receive read/assembly queue have increased. Needed for the
3356          *      low memory discard algorithm 
3357          */
3358            
3359         sk->bytes_rcv += skb->len;
3360         
3361         if (skb->len == 0 && !th->fin && !th->urg && !th->psh) 
3362         {
3363                 /* 
3364                  *      Don't want to keep passing ack's back and forth. 
3365                  *      (someone sent us a dataless, boring frame)
3366                  */
3367                 if (!th->ack)
3368                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3369                 kfree_skb(skb, FREE_READ);
3370                 return(0);
3371         }
3372         
3373         /*
3374          *      We no longer have anyone receiving data on this connection.
3375          */
3376 
3377         if(sk->shutdown & RCV_SHUTDOWN)
3378         {
3379                 /*
3380                  *      FIXME: BSD has some magic to avoid sending resets to
3381                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
3382                  *      BSD stacks still have broken keepalives so we want to
3383                  *      cope with it.
3384                  */
3385                  
3386                 if(skb->len)    /* We don't care if it's just an ack or
3387                                    a keepalive/window probe */
3388                 {
3389                         new_seq= th->seq + skb->len + th->syn;  /* Right edge of _data_ part of frame */
3390                         
3391                         /* Do this the way 4.4BSD treats it. Not what I'd
3392                            regard as the meaning of the spec but it's what BSD
3393                            does and clearly they know everything 8) */
3394                         
3395                         /*
3396                          *      This is valid because of two things
3397                          *
3398                          *      a) The way tcp_data behaves at the bottom.
3399                          *      b) A fin takes effect when read not when received.
3400                          */
3401                          
3402                         shut_seq=sk->acked_seq+1;       /* Last byte */
3403                         
3404                         if(after(new_seq,shut_seq))
3405                         {
3406                                 if(sk->debug)
3407                                         printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n",
3408                                                 sk, new_seq, shut_seq, sk->blog);
3409                                 if(sk->dead)
3410                                 {
3411                                         sk->acked_seq = new_seq + th->fin;
3412                                         tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3413                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3414                                         tcp_statistics.TcpEstabResets++;
3415                                         tcp_set_state(sk,TCP_CLOSE);
3416                                         sk->err = EPIPE;
3417                                         sk->shutdown = SHUTDOWN_MASK;
3418                                         kfree_skb(skb, FREE_READ);
3419                                         return 0;
3420                                 }
3421                         }
3422                 }
3423         }
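
        /*
         *      Worked example of the check above, with assumed figures: if
         *      acked_seq is 7000 then shut_seq is 7001; a frame whose data
         *      runs past sequence 7001 arrives after the receive side was
         *      shut down, so on a dead socket the connection is reset and
         *      aborted, while a bare ack or keepalive/window probe is left
         *      alone.
         */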
3424 
3425         /*
3426          *      Now we have to walk the chain, and figure out where this one
3427          *      goes into it.  This is set up so that the last packet we received
3428          *      will be the first one we look at; that way, if everything comes
3429          *      in order, there will be no performance loss, and if they come
3430          *      out of order we will be able to fit things in nicely.
3431          *
3432          *      [AC: This is wrong. We should assume in order first and then walk
3433          *       forwards from the first hole based upon real traffic patterns.]
3434          *      
3435          */
3436 
3437         /* 
3438          *      This should start at the last one, and then go around forwards.
3439          */
3440 
3441         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
3442         {
3443                 skb_queue_head(&sk->receive_queue,skb);
3444                 skb1= NULL;
3445         } 
3446         else
3447         {
3448                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
3449                 {
3450                         if(sk->debug)
3451                         {
3452                                 printk("skb1=%p :", skb1);
3453                                 printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
3454                                 printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
3455                                 printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
3456                                                 sk->acked_seq);
3457                         }
3458                         
3459                         /*
3460                          *      Optimisation: Duplicate frame or extension of previous frame from
3461                          *      same sequence point (lost ack case).
3462                          *      The frame contains duplicate data or replaces a previous frame:
3463                          *      discard the previous frame (safe as sk->inuse is set) and put
3464                          *      the new one in its place.
3465                          */
3466                          
3467                         if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3468                         {
3469                                 skb_append(skb1,skb);
3470                                 skb_unlink(skb1);
3471                                 kfree_skb(skb1,FREE_READ);
3472                                 dup_dumped=1;
3473                                 skb1=NULL;
3474                                 break;
3475                         }
3476                         
3477                         /*
3478                          *      Found where it fits
3479                          */
3480                          
3481                         if (after(th->seq+1, skb1->h.th->seq))
3482                         {
3483                                 skb_append(skb1,skb);
3484                                 break;
3485                         }
3486                         
3487                         /*
3488                          *      See if we've hit the start. If so insert.
3489                          */
3490                         if (skb1 == skb_peek(&sk->receive_queue))
3491                         {
3492                                 skb_queue_head(&sk->receive_queue, skb);
3493                                 break;
3494                         }
3495                 }
3496         }
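
        /*
         *      In short: the queue is walked backwards from the most recently
         *      queued frame.  A frame with the same starting sequence and at
         *      least as much data replaces the old one; otherwise the new
         *      frame is appended after the newest queued frame that starts at
         *      or before it, or placed at the head if no such frame exists.
         */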
3497 
3498         /*
3499          *      Figure out what the ack value for this frame is
3500          */
3501          
3502         th->ack_seq = th->seq + skb->len;
3503         if (th->syn) 
3504                 th->ack_seq++;
3505         if (th->fin)
3506                 th->ack_seq++;
3507 
3508         if (before(sk->acked_seq, sk->copied_seq)) 
3509         {
3510                 printk("*** tcp.c:tcp_data bug acked < copied\n");
3511                 sk->acked_seq = sk->copied_seq;
3512         }
3513 
3514         /*
3515          *      Now figure out if we can ack anything. This is very messy because we really want two
3516          *      receive queues, a completed and an assembly queue. We also want only one transmit
3517          *      queue.
3518          */
3519 
3520         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) 
3521         {
3522                 if (before(th->seq, sk->acked_seq+1)) 
3523                 {
3524                         int newwindow;
3525 
3526                         if (after(th->ack_seq, sk->acked_seq)) 
3527                         {
3528                                 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3529                                 if (newwindow < 0)
3530                                         newwindow = 0;  
3531                                 sk->window = newwindow;
3532                                 sk->acked_seq = th->ack_seq;
3533                         }
3534                         skb->acked = 1;
3535 
3536                         /*
3537                          *      When we ack the fin, we do the FIN 
3538                          *      processing.
3539                          */
3540 
3541                         if (skb->h.th->fin) 
3542                         {
3543                                 tcp_fin(skb,sk,skb->h.th);
3544                         }
3545           
3546                         for(skb2 = skb->next;
3547                             skb2 != (struct sk_buff *)&sk->receive_queue;
3548                             skb2 = skb2->next) 
3549                         {
3550                                 if (before(skb2->h.th->seq, sk->acked_seq+1)) 
3551                                 {
3552                                         if (after(skb2->h.th->ack_seq, sk->acked_seq))
3553                                         {
3554                                                 newwindow = sk->window -
3555                                                  (skb2->h.th->ack_seq - sk->acked_seq);
3556                                                 if (newwindow < 0)
3557                                                         newwindow = 0;  
3558                                                 sk->window = newwindow;
3559                                                 sk->acked_seq = skb2->h.th->ack_seq;
3560                                         }
3561                                         skb2->acked = 1;
3562                                         /*
3563                                          *      When we ack the fin, we do
3564                                          *      the fin handling.
3565                                          */
3566                                         if (skb2->h.th->fin) 
3567                                         {
3568                                                 tcp_fin(skb,sk,skb->h.th);
3569                                         }
3570 
3571                                         /*
3572                                          *      Force an immediate ack.
3573                                          */
3574                                          
3575                                         sk->ack_backlog = sk->max_ack_backlog;
3576                                 }
3577                                 else
3578                                 {
3579                                         break;
3580                                 }
3581                         }
3582 
3583                         /*
3584                          *      This also takes care of updating the window.
3585                          *      This if statement needs to be simplified.
3586                          */
3587                         if (!sk->delay_acks ||
3588                             sk->ack_backlog >= sk->max_ack_backlog || 
3589                             sk->bytes_rcv > sk->max_unacked || th->fin) {
3590         /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
3591                         }
3592                         else 
3593                         {
3594                                 sk->ack_backlog++;
3595                                 if(sk->debug)
3596                                         printk("Ack queued.\n");
3597                                 reset_timer(sk, TIME_WRITE, TCP_ACK_TIME);
3598                         }
3599                 }
3600         }
3601 
3602         /*
3603          *      If we've missed a packet, send an ack.
3604          *      Also start a timer to send another.
3605          */
3606          
3607         if (!skb->acked) 
3608         {
3609         
3610         /*
3611          *      This is important.  If we don't have much room left,
3612          *      we need to throw out a few packets so we have a good
3613          *      window.  Note that mtu is used, not mss, because mss is really
3614          *      for the send side.  He could be sending us stuff as large as mtu.
3615          */
3616                  
3617                 while (sk->prot->rspace(sk) < sk->mtu) 
3618                 {
3619                         skb1 = skb_peek(&sk->receive_queue);
3620                         if (skb1 == NULL) 
3621                         {
3622                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
3623                                 break;
3624                         }
3625 
3626                         /*
3627                          *      Don't throw out something that has been acked. 
3628                          */
3629                  
3630                         if (skb1->acked) 
3631                         {
3632                                 break;
3633                         }
3634                 
3635                         skb_unlink(skb1);
3636                         kfree_skb(skb1, FREE_READ);
3637                 }
3638                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
3639                 sk->ack_backlog++;
3640                 reset_timer(sk, TIME_WRITE, TCP_ACK_TIME);
3641         }
3642         else
3643         {
3644                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
3645         }
3646 
3647         /*
3648          *      Now tell the user we may have some data. 
3649          */
3650          
3651         if (!sk->dead) 
3652         {
3653                 if(sk->debug)
3654                         printk("Data wakeup.\n");
3655                 sk->data_ready(sk,0);
3656         } 
3657         return(0);
3658 }
3659 
3660 
3661 /*
3662  *      This routine is only called when we have urgent data
3663  *      signalled. It's the 'slow' part of tcp_urg. It could be
3664  *      moved inline now as tcp_urg is only called from one
3665  *      place. We handle URGent data wrong. We have to - as
3666  *      BSD still doesn't use the correction from RFC961.
3667  */
3668  
3669 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
3670 {
3671         unsigned long ptr = ntohs(th->urg_ptr);
3672 
3673         if (ptr)
3674                 ptr--;
3675         ptr += th->seq;
3676 
3677         /* ignore urgent data that we've already seen and read */
3678         if (after(sk->copied_seq, ptr))
3679                 return;
3680 
3681         /* do we already have a newer (or duplicate) urgent pointer? */
3682         if (sk->urg_data && !after(ptr, sk->urg_seq))
3683                 return;
3684 
3685         /* tell the world about our new urgent pointer */
3686         if (sk->proc != 0) {
3687                 if (sk->proc > 0) {
3688                         kill_proc(sk->proc, SIGURG, 1);
3689                 } else {
3690                         kill_pg(-sk->proc, SIGURG, 1);
3691                 }
3692         }
3693         sk->urg_data = URG_NOTYET;
3694         sk->urg_seq = ptr;
3695 }
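
/*
 *      A minimal illustrative sketch (not part of the original file): the
 *      urgent pointer is taken BSD style above, addressing the byte just
 *      past the urgent data, so the urgent byte itself sits at
 *      seq + urg_ptr - 1.  With an assumed seq of 1000 and urg_ptr of 5
 *      the urgent byte is at sequence 1004; tcp_urg() below then finds it
 *      at offset 1004 - seq + doff*4 into the segment.  The helper name
 *      is invented for illustration.
 */
#if 0
static unsigned long example_urgent_seq(unsigned long seq, unsigned short urg_ptr)
{
        if (urg_ptr)            /* a zero pointer is left alone, as above */
                urg_ptr--;
        return seq + urg_ptr;   /* sequence number of the urgent byte */
}
#endif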
3696 
3697 /*
3698  *      This is the 'fast' part of urgent handling.
3699  */
3700  
3701 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
3702         unsigned long saddr, unsigned long len)
3703 {
3704         unsigned long ptr;
3705 
3706         /*
3707          *      Check if we get a new urgent pointer - normally not 
3708          */
3709          
3710         if (th->urg)
3711                 tcp_check_urg(sk,th);
3712 
3713         /*
3714          *      Do we wait for any urgent data? - normally not
3715          */
3716          
3717         if (sk->urg_data != URG_NOTYET)
3718                 return 0;
3719 
3720         /*
3721          *      Is the urgent pointer pointing into this packet? 
3722          */
3723          
3724         ptr = sk->urg_seq - th->seq + th->doff*4;
3725         if (ptr >= len)
3726                 return 0;
3727 
3728         /*
3729          *      Ok, got the correct packet, update info 
3730          */
3731          
3732         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
3733         if (!sk->dead)
3734                 sk->data_ready(sk,0);
3735         return 0;
3736 }
3737 
3738 /*
3739  *      This will accept the next outstanding connection. 
3740  */
3741  
3742 static struct sock *tcp_accept(struct sock *sk, int flags)
3743 {
3744         struct sock *newsk;
3745         struct sk_buff *skb;
3746   
3747         /*
3748          * We need to make sure that this socket is listening,
3749          * and that it has something pending.
3750          */
3751 
3752         if (sk->state != TCP_LISTEN) 
3753         {
3754                 sk->err = EINVAL;
3755                 return(NULL); 
3756         }
3757 
3758         /* Avoid the race. */
3759         cli();
3760         sk->inuse = 1;
3761 
3762         while((skb = tcp_dequeue_established(sk)) == NULL) 
3763         {
3764                 if (flags & O_NONBLOCK) 
3765                 {
3766                         sti();
3767                         release_sock(sk);
3768                         sk->err = EAGAIN;
3769                         return(NULL);
3770                 }
3771 
3772                 release_sock(sk);
3773                 interruptible_sleep_on(sk->sleep);
3774                 if (current->signal & ~current->blocked) 
3775                 {
3776                         sti();
3777                         sk->err = ERESTARTSYS;
3778                         return(NULL);
3779                 }
3780                 sk->inuse = 1;
3781         }
3782         sti();
3783 
3784         /*
3785          *      Now all we need to do is return skb->sk. 
3786          */
3787 
3788         newsk = skb->sk;
3789 
3790         kfree_skb(skb, FREE_READ);
3791         sk->ack_backlog--;
3792         release_sock(sk);
3793         return(newsk);
3794 }
3795 
3796 
3797 /*
3798  *      This will initiate an outgoing connection. 
3799  */
3800  
3801 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
3802 {
3803         struct sk_buff *buff;
3804         struct device *dev=NULL;
3805         unsigned char *ptr;
3806         int tmp;
3807         int atype;
3808         struct tcphdr *t1;
3809         struct rtable *rt;
3810 
3811         if (sk->state != TCP_CLOSE) 
3812         {
3813                 return(-EISCONN);
3814         }
3815         
3816         if (addr_len < 8) 
3817                 return(-EINVAL);
3818 
3819         if (usin->sin_family && usin->sin_family != AF_INET) 
3820                 return(-EAFNOSUPPORT);
3821 
3822         /*
3823          *      connect() to INADDR_ANY means loopback (BSD'ism).
3824          */
3825         
3826         if(usin->sin_addr.s_addr==INADDR_ANY)
3827                 usin->sin_addr.s_addr=ip_my_addr();
3828                   
3829         /*
3830          *      Don't want a TCP connection going to a broadcast address 
3831          */
3832 
3833         if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
3834                 return -ENETUNREACH;
3835   
3836         sk->inuse = 1;
3837         sk->daddr = usin->sin_addr.s_addr;
3838         sk->write_seq = jiffies * SEQ_TICK - seq_offset;
3839         sk->window_seq = sk->write_seq;
3840         sk->rcv_ack_seq = sk->write_seq -1;
3841         sk->err = 0;
3842         sk->dummy_th.dest = usin->sin_port;
3843         release_sock(sk);
3844 
3845         buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
3846         if (buff == NULL) 
3847         {
3848                 return(-ENOMEM);
3849         }
3850         sk->inuse = 1;
3851         buff->len = 24;
3852         buff->sk = sk;
3853         buff->free = 1;
3854         buff->localroute = sk->localroute;
3855         
3856         t1 = (struct tcphdr *) buff->data;
3857 
3858         /*
3859          *      Put in the IP header and routing stuff. 
3860          */
3861          
3862         rt=ip_rt_route(sk->daddr, NULL, NULL);
3863         
3864 
3865         /*
3866          *      We need to build the routing stuff from the things saved in skb. 
3867          */
3868 
3869         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
3870                                         IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
3871         if (tmp < 0) 
3872         {
3873                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
3874                 release_sock(sk);
3875                 return(-ENETUNREACH);
3876         }
3877 
3878         buff->len += tmp;
3879         t1 = (struct tcphdr *)((char *)t1 +tmp);
3880 
3881         memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
3882         t1->seq = ntohl(sk->write_seq++);
3883         sk->sent_seq = sk->write_seq;
3884         buff->h.seq = sk->write_seq;
3885         t1->ack = 0;
3886         t1->window = 2;
3887         t1->res1=0;
3888         t1->res2=0;
3889         t1->rst = 0;
3890         t1->urg = 0;
3891         t1->psh = 0;
3892         t1->syn = 1;
3893         t1->urg_ptr = 0;
3894         t1->doff = 6;
3895         /* use 512 or whatever user asked for */
3896         
3897         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3898                 sk->window_clamp=rt->rt_window;
3899         else
3900                 sk->window_clamp=0;
3901 
3902         if (sk->user_mss)
3903                 sk->mtu = sk->user_mss;
3904         else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
3905                 sk->mtu = rt->rt_mss;
3906         else 
3907         {
3908 #ifdef CONFIG_INET_SNARL
3909                 if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
3910 #else
3911                 if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
3912 #endif
3913                         sk->mtu = 576 - HEADER_SIZE;
3914                 else
3915                         sk->mtu = MAX_WINDOW;
3916         }
3917         /*
3918          *      but not bigger than device MTU 
3919          */
3920 
3921         if(sk->mtu <32)
3922                 sk->mtu = 32;   /* Sanity limit */
3923                 
3924         sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);
3925         
3926         /*
3927          *      Put in the TCP options to say MTU. 
3928          */
3929 
3930         ptr = (unsigned char *)(t1+1);
3931         ptr[0] = 2;
3932         ptr[1] = 4;
3933         ptr[2] = (sk->mtu) >> 8;
3934         ptr[3] = (sk->mtu) & 0xff;
3935         tcp_send_check(t1, sk->saddr, sk->daddr,
3936                   sizeof(struct tcphdr) + 4, sk);
3937 
3938         /*
3939          *      This must go first, otherwise a really quick response will get reset.
3940          */
3941 
3942         tcp_set_state(sk,TCP_SYN_SENT);
3943         sk->rto = TCP_TIMEOUT_INIT;
3944         reset_timer(sk, TIME_WRITE, sk->rto);   /* Timer for repeating the SYN until an answer */
3945         sk->retransmits = TCP_RETR2 - TCP_SYN_RETRIES;
3946 
3947         sk->prot->queue_xmit(sk, dev, buff, 0);  
3948         tcp_statistics.TcpActiveOpens++;
3949         tcp_statistics.TcpOutSegs++;
3950   
3951         release_sock(sk);
3952         return(0);
3953 }
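
/*
 *      A minimal illustrative sketch (not part of the original file): the
 *      four option bytes written in tcp_connect() above are the standard
 *      MSS option - kind 2, length 4, then the 16 bit MSS in network byte
 *      order - so an MSS of 1460 is laid out as 02 04 05 b4.  The helper
 *      name is invented for illustration.
 */
#if 0
static void example_write_mss_option(unsigned char *ptr, unsigned short mss)
{
        ptr[0] = 2;             /* option kind: maximum segment size */
        ptr[1] = 4;             /* option length, counting these two bytes */
        ptr[2] = mss >> 8;      /* high byte first (network byte order) */
        ptr[3] = mss & 0xff;    /* low byte */
}
#endif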
3954 
3955 
3956 /* This function checks to see if the tcp header is actually acceptable. */
3957 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
3958              struct options *opt, unsigned long saddr, struct device *dev)
3959 {
3960         unsigned long next_seq;
3961 
3962         next_seq = len - 4*th->doff;
3963         if (th->fin)
3964                 next_seq++;
3965         /* if we have a zero window, we can't have any data in the packet.. */
3966         if (next_seq && !sk->window)
3967                 goto ignore_it;
3968         next_seq += th->seq;
3969 
3970         /*
3971          * This isn't quite right.  sk->acked_seq could be more recent
3972          * than sk->window.  This is however close enough.  We will accept
3973          * slightly more packets than we should, but it should not cause
3974          * problems unless someone is trying to forge packets.
3975          */
3976 
3977         /* have we already seen all of this packet? */
3978         if (!after(next_seq+1, sk->acked_seq))
3979                 goto ignore_it;
3980         /* or does it start beyond the window? */
3981         if (!before(th->seq, sk->acked_seq + sk->window + 1))
3982                 goto ignore_it;
3983 
3984         /* ok, at least part of this packet would seem interesting.. */
3985         return 1;
3986 
3987 ignore_it:
3988         if (th->rst)
3989                 return 0;
3990 
3991         /*
3992          *      Send a reset if we get something not ours and we are
3993          *      unsynchronized. Note: We don't do anything to our end. We
3994          *      are just killing the bogus remote connection; then we will
3995          *      connect again and it will work (with luck).
3996          */
3997          
3998         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
3999         {
4000                 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4001                 return 1;
4002         }
4003 
4004         /* Try to resync things. */
4005         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4006         return 0;
4007 }
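
/*
 *      Worked example of the acceptability test above, with assumed
 *      figures: with acked_seq = 5000 and an offered window of 4000, a
 *      segment is kept only if its right edge (seq plus data length, plus
 *      one for a FIN) reaches at least 5000 and its first byte starts no
 *      later than 9000; anything ending below that range is treated as
 *      already seen, and anything starting beyond it as outside the window.
 */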
4008 
4009 /*
4010  *      When we get a reset we do this.
4011  */
4012 
4013 static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
4014 {
4015         sk->zapped = 1;
4016         sk->err = ECONNRESET;
4017         if (sk->state == TCP_SYN_SENT)
4018                 sk->err = ECONNREFUSED;
4019         if (sk->state == TCP_CLOSE_WAIT)
4020                 sk->err = EPIPE;
4021 #ifdef TCP_DO_RFC1337           
4022         /*
4023          *      Time wait assassination protection [RFC1337]
4024          */
4025         if(sk->state!=TCP_TIME_WAIT)
4026         {       
4027                 tcp_set_state(sk,TCP_CLOSE);
4028                 sk->shutdown = SHUTDOWN_MASK;
4029         }
4030 #else   
4031         tcp_set_state(sk,TCP_CLOSE);
4032         sk->shutdown = SHUTDOWN_MASK;
4033 #endif  
4034         if (!sk->dead) 
4035                 sk->state_change(sk);
4036         kfree_skb(skb, FREE_READ);
4037         release_sock(sk);
4038         return(0);
4039 }
4040 
4041 /*
4042  *      A TCP packet has arrived.
4043  */
4044  
4045 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
4046         unsigned long daddr, unsigned short len,
4047         unsigned long saddr, int redo, struct inet_protocol * protocol)
4048 {
4049         struct tcphdr *th;
4050         struct sock *sk;
4051         int syn_ok=0;
4052         
4053         if (!skb) 
4054         {
4055                 printk("IMPOSSIBLE 1\n");
4056                 return(0);
4057         }
4058 
4059         if (!dev) 
4060         {
4061                 printk("IMPOSSIBLE 2\n");
4062                 return(0);
4063         }
4064   
4065         tcp_statistics.TcpInSegs++;
4066   
4067         if(skb->pkt_type!=PACKET_HOST)
4068         {
4069                 kfree_skb(skb,FREE_READ);
4070                 return(0);
4071         }
4072   
4073         th = skb->h.th;
4074 
4075         /*
4076          *      Find the socket.
4077          */
4078 
4079         sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4080 
4081         /*
4082          *      If this socket has got a reset it's to all intents and purposes
4083          *      really dead. Count closed sockets as dead.
4084          *
4085          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4086          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
4087          *      exist so should cause resets as if the port was unreachable.
4088          */
4089          
4090         if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4091                 sk=NULL;
4092 
4093         if (!redo) 
4094         {
4095                 if (tcp_check(th, len, saddr, daddr )) 
4096                 {
4097                         skb->sk = NULL;
4098                         kfree_skb(skb,FREE_READ);
4099                         /*
4100                          * We don't release the socket because it was
4101                          * never marked in use.
4102                          */
4103                         return(0);
4104                 }
4105                 th->seq = ntohl(th->seq);
4106 
4107                 /* See if we know about the socket. */
4108                 if (sk == NULL) 
4109                 {
4110                         /*
4111                          *      No such TCB. If th->rst is 0, send a reset (the check is done in tcp_reset)
4112                          */
4113                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4114                         skb->sk = NULL;
4115                         /*
4116                          *      Discard frame
4117                          */
4118                         kfree_skb(skb, FREE_READ);
4119                         return(0);
4120                 }
4121 
4122                 skb->len = len;
4123                 skb->acked = 0;
4124                 skb->used = 0;
4125                 skb->free = 0;
4126                 skb->saddr = daddr;
4127                 skb->daddr = saddr;
4128         
4129                 /* We may need to add it to the backlog here. */
4130                 cli();
4131                 if (sk->inuse) 
4132                 {
4133                         skb_queue_tail(&sk->back_log, skb);
4134                         sti();
4135                         return(0);
4136                 }
4137                 sk->inuse = 1;
4138                 sti();
4139         }
4140         else
4141         {
4142                 if (sk==NULL) 
4143                 {
4144                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4145                         skb->sk = NULL;
4146                         kfree_skb(skb, FREE_READ);
4147                         return(0);
4148                 }
4149         }
4150 
4151 
4152         if (!sk->prot) 
4153         {
4154                 printk("IMPOSSIBLE 3\n");
4155                 return(0);
4156         }
4157 
4158 
4159         /*
4160          *      Charge the memory to the socket. 
4161          */
4162          
4163         if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf) 
4164         {
4165                 kfree_skb(skb, FREE_READ);
4166                 release_sock(sk);
4167                 return(0);
4168         }
4169 
4170         skb->sk=sk;
4171         sk->rmem_alloc += skb->mem_len;
4172 
4173         /*
4174          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4175          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4176          *      compatibility. We also set up variables more thoroughly [Karn notes in the
4177          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4178          */
4179 
4180         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
4181         {
4182         
4183                 /*
4184                  *      Now deal with unusual cases.
4185                  */
4186          
4187                 if(sk->state==TCP_LISTEN)
4188                 {
4189                         if(th->ack)     /* This reset uses the socket TOS - we might want the received TOS */
4190                                 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4191 
4192                         /*
4193                          *      We don't care about RST, and non-SYN segments are absorbed
4194                          *      (old segments). Broadcast/multicast SYN isn't allowed. Note:
4195                          *      if you change the netmask on a running connection it can go
4196                          *      broadcast - even Suns have this problem, so I'm ignoring it.
4197                          */
4198                            
4199                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4200                         {
4201                                 kfree_skb(skb, FREE_READ);
4202                                 release_sock(sk);
4203                                 return 0;
4204                         }
4205                 
4206                         /*      
4207                          *      Guess we need to make a new socket up 
4208                          */
4209                 
4210                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4211                 
4212                         /*
4213                          *      Now we have several options: in theory there is nothing else
4214                          *      in the frame. KA9Q has an option to send data with the SYN,
4215                          *      BSD accepts data with the SYN up to the [to be] advertised window,
4216                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
4217                          *      any such data; that fits the spec precisely and avoids incompatibilities.
4218                          *      It would be nice in future to drop through and process the data.
4219                          */
4220                          
4221                         release_sock(sk);
4222                         return 0;
4223                 }
4224         
4225                 /* retransmitted SYN? */
4226                 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4227                 {
4228                         kfree_skb(skb, FREE_READ);
4229                         release_sock(sk);
4230                         return 0;
4231                 }
4232                 
4233                 /*
4234                  *      SYN sent means we have to look for a suitable ACK and either reset
4235                  *      on a bad match or move to the established state.
4236                  */
4237            
4238                 if(sk->state==TCP_SYN_SENT)
4239                 {
4240                         /* Crossed SYN or previous junk segment */
4241                         if(th->ack)
4242                         {
4243                                 /* We got an ack, but it's not a good ack */
4244                                 if(!tcp_ack(sk,th,saddr,len))
4245                                 {
4246                                         /* Reset the ack - it's an ack from a
4247                                            different connection  [th->rst is checked in tcp_reset()] */
4248                                         tcp_statistics.TcpAttemptFails++;
4249                                         tcp_reset(daddr, saddr, th,
4250                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4251                                         kfree_skb(skb, FREE_READ);
4252                                         release_sock(sk);
4253                                         return(0);
4254                                 }
4255                                 if(th->rst)
4256                                         return tcp_std_reset(sk,skb);
4257                                 if(!th->syn)
4258                                 {
4259                                         /* A valid ack from a different connection
4260                                            attempt. Shouldn't happen, but cover it */
4261                                         kfree_skb(skb, FREE_READ);
4262                                         release_sock(sk);
4263                                         return 0;
4264                                 }
4265                                 /*
4266                                  *      OK, it's good. Set up the sequence numbers and
4267                                  *      move to established.
4268                                  */
4269                                 syn_ok=1;       /* Don't reset this connection for the syn */
4270                                 sk->acked_seq=th->seq+1;
4271                                 sk->fin_seq=th->seq;
4272                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4273                                 tcp_set_state(sk, TCP_ESTABLISHED);
4274                                 tcp_options(sk,th);
4275                                 sk->dummy_th.dest=th->source;
4276                                 sk->copied_seq = sk->acked_seq;
4277                                 if(!sk->dead)
4278                                         sk->state_change(sk);
4279                                 if(sk->max_window==0)
4280                                 {
4281                                         sk->max_window = 32;
4282                                         sk->mss = min(sk->max_window, sk->mtu);
4283                                 }
4284                         }
4285                         else
4286                         {
4287                                 /* See if SYNs cross. Drop if boring. */
4288                                 if(th->syn && !th->rst)
4289                                 {
4290                                         /* Crossed SYNs are fine - but talking to
4291                                            yourself is right out... */
4292                                         if(sk->saddr==saddr && sk->daddr==daddr &&
4293                                                 sk->dummy_th.source==th->source &&
4294                                                 sk->dummy_th.dest==th->dest)
4295                                         {
4296                                                 tcp_statistics.TcpAttemptFails++;
4297                                                 return tcp_std_reset(sk,skb);
4298                                         }
4299                                         tcp_set_state(sk,TCP_SYN_RECV);
4300                                         
4301                                         /*
4302                                          *      FIXME:
4303                                          *      Must send SYN|ACK here
4304                                          */
4305                                 }               
4306                                 /* Discard junk segment */
4307                                 kfree_skb(skb, FREE_READ);
4308                                 release_sock(sk);
4309                                 return 0;
4310                         }
4311                         /*
4312                          *      We are now SYN_RECV, possibly with data - drop through.
4313                          */
4314                         goto rfc_step6;
4315                 }
4316 
4317         /* BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
4318            a more complex suggestion for fixing these reuse issues in RFC1644,
4319            but it is not yet ready for general use. Also see RFC1379. */
4320         
4321 #define BSD_TIME_WAIT
4322 #ifdef BSD_TIME_WAIT
4323                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
4324                         after(th->seq, sk->acked_seq) && !th->rst)
4325                 {
4326                         long seq=sk->write_seq;
4327                         if(sk->debug)
4328                                 printk("Doing a BSD time wait\n");
4329                         tcp_statistics.TcpEstabResets++;           
4330                         sk->rmem_alloc -= skb->mem_len;
4331                         skb->sk = NULL;
4332                         sk->err=ECONNRESET;
4333                         tcp_set_state(sk, TCP_CLOSE);
4334                         sk->shutdown = SHUTDOWN_MASK;
4335                         release_sock(sk);
4336                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4337                         if (sk && sk->state==TCP_LISTEN)
4338                         {
4339                                 sk->inuse=1;
4340                                 skb->sk = sk;
4341                                 sk->rmem_alloc += skb->mem_len;
4342                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
4343                                 release_sock(sk);
4344                                 return 0;
4345                         }
4346                         kfree_skb(skb, FREE_READ);
4347                         return 0;
4348                 }
4349 #endif  
4350         }
4351 
4352         /* We are now in the normal data flow (see the step list in the RFC;
4353            a condensed outline also follows this function). Note most of these
4354            are inline now; I'll inline the lot when I have time to test it hard and look at what gcc outputs. */
4355         
4356         if(!tcp_sequence(sk,th,len,opt,saddr,dev))
4357         {
4358                 kfree_skb(skb, FREE_READ);
4359                 release_sock(sk);
4360                 return 0;
4361         }
4362 
4363         if(th->rst)
4364                 return tcp_std_reset(sk,skb);
4365         
4366         /*
4367          *      !syn_ok is effectively the state test in RFC793.
4368          */
4369          
4370         if(th->syn && !syn_ok)
4371         {
4372                 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
4373                 return tcp_std_reset(sk,skb);   
4374         }
4375 
4376         /*
4377          *      Process the ACK
4378          */
4379          
4380 
4381         if(th->ack && !tcp_ack(sk,th,saddr,len))
4382         {
4383                 /*
4384                  *      Our three way handshake failed.
4385                  */
4386                  
4387                 if(sk->state==TCP_SYN_RECV)
4388                 {
4389                         tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
4390                 }
4391                 kfree_skb(skb, FREE_READ);
4392                 release_sock(sk);
4393                 return 0;
4394         }
4395         
4396 rfc_step6:              /* I'll clean this up later */
4397 
4398         /*
4399          *      Process urgent data
4400          */
4401                 
4402         if(tcp_urg(sk, th, saddr, len))
4403         {
4404                 kfree_skb(skb, FREE_READ);
4405                 release_sock(sk);
4406                 return 0;
4407         }
4408         
4409         
4410         /*
4411          *      Process the encapsulated data
4412          */
4413         
4414         if(tcp_data(skb,sk, saddr, len))
4415         {
4416                 kfree_skb(skb, FREE_READ);
4417                 release_sock(sk);
4418                 return 0;
4419         }
4420 
4421         /*
4422          *      And done
4423          */     
4424         
4425         release_sock(sk);
4426         return 0;
4427 }
4428 
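/*
 *      Editor's sketch (not part of the original file): the step order that
 *      tcp_rcv() above follows once the unusual states are out of the way,
 *      reduced to a self-contained outline.  The struct and function names
 *      here are invented purely for illustration.
 */
struct seg_summary {
        int seq_ok;             /* passed the tcp_sequence() window test   */
        int rst;                /* RST flag set                            */
        int syn_unexpected;     /* SYN flag set and !syn_ok                */
        int ack;                /* ACK flag set                            */
        int ack_ok;             /* tcp_ack() accepted it                   */
};

enum seg_verdict { SEG_DROP, SEG_KILL_CONNECTION, SEG_RESET_PEER, SEG_PROCESS };

static enum seg_verdict rcv_outline(const struct seg_summary *s)
{
        if (!s->seq_ok)                 /* 1. sequence/window check         */
                return SEG_DROP;        /*    (tcp_sequence acks to resync) */
        if (s->rst)                     /* 2. RST handling (tcp_std_reset)  */
                return SEG_KILL_CONNECTION;
        if (s->syn_unexpected)          /* 3. unexpected in-window SYN      */
                return SEG_RESET_PEER;  /*    (local socket is reset too)   */
        if (s->ack && !s->ack_ok)       /* 4. ACK processing (tcp_ack)      */
                return SEG_DROP;
        return SEG_PROCESS;             /* 5. URG (tcp_urg), 6. data (tcp_data) */
}
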
4429 /*
4430  *      This routine sends a packet with an out-of-date sequence
4431  *      number. It assumes the other end will try to ack it.
4432  */
4433 
4434 static void tcp_write_wakeup(struct sock *sk)
4435 {
4436         struct sk_buff *buff;
4437         struct tcphdr *t1;
4438         struct device *dev=NULL;
4439         int tmp;
4440 
4441         if (sk->zapped)
4442                 return; /* After a valid reset we can send no more */
4443 
4444         /*
4445          *      Write data can still be transmitted/retransmitted in the
4446          *      following states.  If any other state is encountered, return.
4447          */
4448 
4449         if (sk->state != TCP_ESTABLISHED && 
4450             sk->state != TCP_CLOSE_WAIT &&
4451             sk->state != TCP_FIN_WAIT1 && 
4452             sk->state != TCP_LAST_ACK &&
4453             sk->state != TCP_CLOSING
4454         ) 
4455         {
4456                 return;
4457         }
4458 
4459         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
4460         if (buff == NULL) 
4461                 return;
4462 
4463         buff->len = sizeof(struct tcphdr);
4464         buff->free = 1;
4465         buff->sk = sk;
4466         buff->localroute = sk->localroute;
4467 
4468         t1 = (struct tcphdr *) buff->data;
4469 
4470         /* Put in the IP header and routing stuff. */
4471         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4472                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
4473         if (tmp < 0) 
4474         {
4475                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4476                 return;
4477         }
4478 
4479         buff->len += tmp;
4480         t1 = (struct tcphdr *)((char *)t1 +tmp);
4481 
4482         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
4483 
4484         /*
4485          * Use a previous sequence: it lies outside the receiver's window,
4486          * so the other end should answer with an ACK giving its current state.
4487          */
4488         t1->seq = htonl(sk->sent_seq-1);
4489         t1->ack = 1; 
4490         t1->res1= 0;
4491         t1->res2= 0;
4492         t1->rst = 0;
4493         t1->urg = 0;
4494         t1->psh = 0;
4495         t1->fin = 0;    /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
4496         t1->syn = 0;
4497         t1->ack_seq = ntohl(sk->acked_seq);
4498         t1->window = ntohs(tcp_select_window(sk));
4499         t1->doff = sizeof(*t1)/4;
4500         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
4501 
4502         /*
4503          *      Send it and free it. This will prevent the timer from automatically being restarted.
4504          */
4505         sk->prot->queue_xmit(sk, dev, buff, 1);
4506         tcp_statistics.TcpOutSegs++;
4507 }
4508 
4509 /*
4510  *      A window probe timeout has occurred.
4511  */
4512 
4513 void tcp_send_probe0(struct sock *sk)
4514 {
4515         if (sk->zapped)
4516                 return;         /* After a valid reset we can send no more */
4517 
4518         tcp_write_wakeup(sk);
4519 
4520         sk->backoff++;
4521         sk->rto = min(sk->rto << 1, 120*HZ);
4522         reset_timer (sk, TIME_PROBE0, sk->rto);
4523         sk->retransmits++;
4524         sk->prot->retransmits ++;
4525 }
4526 
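/*
 *      Editor's sketch (not part of the original file): the probe backoff
 *      schedule implied by sk->rto = min(sk->rto << 1, 120*HZ) above.
 *      HZ = 100 and a one-second starting rto are assumptions made purely
 *      for this illustration.
 */
#include <stdio.h>

int main(void)
{
        const long hz = 100;            /* assumed clock tick rate         */
        long rto = 1 * hz;              /* assumed starting timeout        */
        int probe;

        for (probe = 1; probe <= 10; probe++) {
                rto <<= 1;              /* exponential backoff             */
                if (rto > 120 * hz)
                        rto = 120 * hz; /* capped at two minutes           */
                printf("probe %2d: next timeout %ld ticks (%lds)\n",
                       probe, rto, rto / hz);
        }
        return 0;
}
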
4527 /*
4528  *      Socket option code for TCP. 
4529  */
4530   
4531 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
4532 {
4533         int val,err;
4534 
4535         if(level!=SOL_TCP)
4536                 return ip_setsockopt(sk,level,optname,optval,optlen);
4537 
4538         if (optval == NULL) 
4539                 return(-EINVAL);
4540 
4541         err=verify_area(VERIFY_READ, optval, sizeof(int));
4542         if(err)
4543                 return err;
4544         
4545         val = get_fs_long((unsigned long *)optval);
4546 
4547         switch(optname)
4548         {
4549                 case TCP_MAXSEG:
4550 /*
4551  * Values greater than the interface MTU won't take effect. However, at
4552  * the point when this call is made we typically don't yet know
4553  * which interface is going to be used.
4554  */
4555                         if(val<1||val>MAX_WINDOW)
4556                                 return -EINVAL;
4557                         sk->user_mss=val;
4558                         return 0;
4559                 case TCP_NODELAY:
4560                         sk->nonagle=(val==0)?0:1;
4561                         return 0;
4562                 default:
4563                         return(-ENOPROTOOPT);
4564         }
4565 }
4566 
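/*
 *      Editor's sketch (not part of the original file): how a user program
 *      reaches tcp_setsockopt() above, here to disable the Nagle algorithm.
 *      IPPROTO_TCP carries the same level value as the SOL_TCP check above.
 *      Illustrative only.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>

static int disable_nagle(int fd)
{
        int one = 1;

        /* TCP_NODELAY ends up setting sk->nonagle in the switch above */
        if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)) < 0) {
                perror("setsockopt(TCP_NODELAY)");
                return -1;
        }
        return 0;
}
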
4567 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
4568 {
4569         int val,err;
4570 
4571         if(level!=SOL_TCP)
4572                 return ip_getsockopt(sk,level,optname,optval,optlen);
4573                         
4574         switch(optname)
4575         {
4576                 case TCP_MAXSEG:
4577                         val=sk->user_mss;
4578                         break;
4579                 case TCP_NODELAY:
4580                         val=sk->nonagle;
4581                         break;
4582                 default:
4583                         return(-ENOPROTOOPT);
4584         }
4585         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
4586         if(err)
4587                 return err;
4588         put_fs_long(sizeof(int),(unsigned long *) optlen);
4589 
4590         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
4591         if(err)
4592                 return err;
4593         put_fs_long(val,(unsigned long *)optval);
4594 
4595         return(0);
4596 }       
4597 
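/*
 *      Editor's sketch (not part of the original file): reading an option
 *      back through tcp_getsockopt() above.  Note that in this version
 *      TCP_MAXSEG simply returns sk->user_mss, i.e. whatever an earlier
 *      TCP_MAXSEG setsockopt stored (0 if none).  Illustrative only.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>

static void show_user_mss(int fd)
{
        int mss = 0;
        socklen_t len = sizeof(mss);

        if (getsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, &len) == 0)
                printf("TCP_MAXSEG (user_mss): %d\n", mss);
        else
                perror("getsockopt(TCP_MAXSEG)");
}
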
4598 
4599 struct proto tcp_prot = {
4600         sock_wmalloc,
4601         sock_rmalloc,
4602         sock_wfree,
4603         sock_rfree,
4604         sock_rspace,
4605         sock_wspace,
4606         tcp_close,
4607         tcp_read,
4608         tcp_write,
4609         tcp_sendto,
4610         tcp_recvfrom,
4611         ip_build_header,
4612         tcp_connect,
4613         tcp_accept,
4614         ip_queue_xmit,
4615         tcp_retransmit,
4616         tcp_write_wakeup,
4617         tcp_read_wakeup,
4618         tcp_rcv,
4619         tcp_select,
4620         tcp_ioctl,
4621         NULL,
4622         tcp_shutdown,
4623         tcp_setsockopt,
4624         tcp_getsockopt,
4625         128,
4626         0,
4627         {NULL,},
4628         "TCP"
4629 };
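
/*
 *      Editor's note (not part of the original file): the positional
 *      initialiser above fills struct proto's operation table.  The rest
 *      of the stack reaches TCP only through these pointers; for example,
 *      calls made elsewhere in this file look like:
 *
 *              buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
 *              sk->prot->queue_xmit(sk, dev, buff, 1);
 *              sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
 *
 *      so the functions listed above (tcp_close, tcp_rcv, ip_queue_xmit,
 *      ...) are the concrete implementations bound to those slots.
 */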
