/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	@(#)tcp_input.c	1.0.16	05/25/93
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * FIXES
 *		Pedro Roque	:	Double ACK bug
 */
#include <linux/config.h>
#include <net/tcp.h>

/*
 *	Policy code extracted so it's now separate
 */

/*
 *	Called each time to estimate the delayed ack timeout. This is
 *	how it should be done so a fast link isn't impacted by ack delay.
 */

extern __inline__ void tcp_delack_estimator(struct sock *sk)
{
	/*
	 *	Delayed ACK time estimator.
	 */

	if (sk->lrcvtime == 0)
	{
		sk->lrcvtime = jiffies;
		sk->ato = HZ/3;
	}
	else
	{
		int m;

		m = jiffies - sk->lrcvtime;

		sk->lrcvtime = jiffies;

		if (m <= 0)
			m = 1;

		if (m > (sk->rtt >> 3))
		{
			sk->ato = sk->rtt >> 3;
			/*
			 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
			 */
		}
		else
		{
			sk->ato = (sk->ato >> 1) + m;
			/*
			 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
			 */
		}
	}
}
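
/*
 * Illustrative note (editor's addition, not from the original authors):
 * sk->ato is a smoothed estimate of the packet inter-arrival time, used
 * as the delayed-ACK timeout. Each new gap m is folded in as
 * ato = ato/2 + m, and on a fast-feeding link ato is clamped to rtt/8.
 * For example, with HZ=100 and segments arriving every 2 jiffies, ato
 * converges towards about 4 jiffies, so ACK delay stays tiny.
 */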

/*
 *	Called on frames that were known _not_ to have been
 *	retransmitted [see Karn/Partridge Proceedings SIGCOMM 87].
 *	The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
 */

extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
{
	long m;
	/*
	 *	The following amusing code comes from Jacobson's
	 *	article in SIGCOMM '88. Note that rtt and mdev
	 *	are scaled versions of rtt and mean deviation.
	 *	This is designed to be as fast as possible.
	 *	m stands for "measurement".
	 */

	m = jiffies - oskb->when;	/* RTT */
	if (m <= 0)
		m = 1;			/* IS THIS RIGHT FOR <0 ??? */
	m -= (sk->rtt >> 3);		/* m is now error in rtt est */
	sk->rtt += m;			/* rtt = 7/8 rtt + 1/8 new */
	if (m < 0)
		m = -m;			/* m is now abs(error) */
	m -= (sk->mdev >> 2);		/* similar update on mdev */
	sk->mdev += m;			/* mdev = 3/4 mdev + 1/4 new */

	/*
	 *	Now update timeout. Note that this removes any backoff.
	 */

	sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
	if (sk->rto > 120*HZ)
		sk->rto = 120*HZ;
	if (sk->rto < HZ/5)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
		sk->rto = HZ/5;
	sk->backoff = 0;
}
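
/*
 * Worked example (editor's addition, following the scaling above):
 * sk->rtt holds 8 times the smoothed RTT and sk->mdev 4 times the mean
 * deviation, so
 *
 *	rto = ((8*srtt >> 2) + 4*mdev) >> 1 = srtt + 2*mdev
 *
 * With HZ=100, an srtt of 30 jiffies and mdev of 5 jiffies this gives
 * rto = 30 + 10 = 40 jiffies (400ms), then clamped into [HZ/5, 120*HZ]
 * by the tests above.
 */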

/*
 *	Cached last hit socket
 */

static volatile unsigned long	th_cache_saddr, th_cache_daddr;
static volatile unsigned short	th_cache_dport, th_cache_sport;
static volatile struct sock	*th_cache_sk;

void tcp_cache_zap(void)
{
	th_cache_sk = NULL;
}

/*
 *	Find the socket, using the last hit cache if applicable. The cache
 *	is not quite right...
 */

static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
{
	struct sock * sk;

	sk = (struct sock *) th_cache_sk;
	if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
	    sport != th_cache_sport || dport != th_cache_dport) {
		sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
		if (sk) {
			th_cache_saddr = saddr;
			th_cache_daddr = daddr;
			th_cache_dport = dport;
			th_cache_sport = sport;
			th_cache_sk = sk;
		}
	}
	return sk;
}
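
/*
 * Editor's note: this is a one-entry demultiplexing cache keyed on the
 * (saddr, sport, daddr, dport) 4-tuple. Back-to-back segments for the
 * same connection skip the get_sock() lookup entirely; a segment for
 * any other connection just refills the entry. tcp_cache_zap() must be
 * called whenever a cached socket may be going away.
 */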

/*
 *	React to an out-of-window TCP sequence number in an incoming packet
 */

static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
	     struct options *opt, unsigned long saddr, struct device *dev)
{
	if (th->rst)
		return;

	/*
	 *	Send a reset if we get something not ours and we are
	 *	unsynchronized. Note: We don't do anything to our end. We
	 *	are just killing the bogus remote connection then we will
	 *	connect again and it will work (with luck).
	 */

	if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
	{
		tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
		return;
	}

	/*
	 *	4.3reno machines look for these kind of acks so they can do fast
	 *	recovery. Three identical 'old' acks let it know that one frame has
	 *	been lost and should be resent. Because this is before the whole window
	 *	of data has timed out it can take one lost frame per window without
	 *	stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
	 *
	 *	We also should be spotting triple bad sequences.
	 */
	tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
	return;
}

/*
 *	This function checks to see if the tcp header is actually acceptable.
 */

extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
{
	u32 end_window = sk->acked_seq + sk->window;
	return	/* if start is at end of window, end must be too (zero window) */
		(seq == end_window && seq == end_seq) ||
		/* if start is before end of window, check for interest */
		(before(seq, end_window) && !before(end_seq, sk->acked_seq));
}
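
/*
 * Example (editor's addition): with acked_seq = 1000 and window = 500
 * the acceptable region is [1000, 1500). A segment with seq = 1200 and
 * end_seq = 1300 overlaps it and is accepted; seq = 1600 lies beyond
 * the right edge and is rejected; with a zero window only a bare
 * segment with seq == end_seq == 1000 passes, via the first clause.
 */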

/*
 *	When we get a reset we do this. This probably is a tcp_output routine
 *	really.
 */

static int tcp_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;
	/*
	 *	We want the right error as BSD sees it (and indeed as we do).
	 */
	sk->err = ECONNRESET;
	if (sk->state == TCP_SYN_SENT)
		sk->err = ECONNREFUSED;
	if (sk->state == TCP_CLOSE_WAIT)
		sk->err = EPIPE;
#ifdef CONFIG_TCP_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 *
	 *	This is a good idea, but causes more sockets to take time to close.
	 *
	 *	Ian Heavens has since shown this is an inadequate fix for the protocol
	 *	bug in question.
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	return(0);
}

/*
 *	Look for tcp options. Parses everything but only knows about MSS.
 *	This routine is always called with the packet containing the SYN.
 *	However it may also be called with the ack to the SYN. So you
 *	can't assume this is always the SYN. It's always called after
 *	we have set up sk->mtu to our own MTU.
 *
 *	We need at minimum to add PAWS support here. Possibly large windows
 *	as Linux gets deployed on 100Mb/sec networks.
 */

static void tcp_options(struct sock *sk, struct tcphdr *th)
{
	unsigned char *ptr;
	int length = (th->doff*4) - sizeof(struct tcphdr);
	int mss_seen = 0;

	ptr = (unsigned char *)(th + 1);

	while (length > 0)
	{
		int opcode = *ptr++;
		int opsize = *ptr++;
		switch (opcode)
		{
			case TCPOPT_EOL:
				return;
			case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
				length--;
				ptr--;		/* the opsize=*ptr++ above was a mistake */
				continue;

			default:
				if (opsize <= 2)	/* Avoid silly options looping forever */
					return;
				switch (opcode)
				{
					case TCPOPT_MSS:
						if (opsize == 4 && th->syn)
						{
							sk->mtu = min(sk->mtu, ntohs(*(unsigned short *)ptr));
							mss_seen = 1;
						}
						break;
					/* Add other options here as people feel the urge to implement stuff like large windows */
				}
				ptr += opsize - 2;
				length -= opsize;
		}
	}
	if (th->syn)
	{
		if (!mss_seen)
			sk->mtu = min(sk->mtu, 536);	/* default MSS if none sent */
	}
#ifdef CONFIG_INET_PCTCP
	sk->mss = min(sk->max_window >> 1, sk->mtu);
#else
	sk->mss = min(sk->max_window, sk->mtu);
	sk->max_unacked = 2 * sk->mss;
#endif
}
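
/*
 * Wire format reminder (editor's addition): each non-NOP option is
 * <kind:1 byte> <length:1 byte> <data:length-2 bytes>. An MSS option
 * announcing 1460 is the four bytes 0x02 0x04 0x05 0xB4, which is why
 * only opsize == 4 is accepted above before the 16-bit value at ptr is
 * read.
 */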

/*
 *	This routine handles a connection request.
 *	It should make sure we haven't already responded.
 *	Because of the way BSD works, we have to send a syn/ack now.
 *	This also means it will be harder to close a socket which is
 *	listening.
 */

static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
{
	struct sock *newsk;
	struct tcphdr *th;
	struct rtable *rt;

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead)
	{
		sk->data_ready(sk,0);
	}
	else
	{
		if (sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	Make sure we can accept more. This will prevent a
	 *	flurry of syns from eating up all our memory.
	 *
	 *	BSD does some funnies here and allows 3/2 times the
	 *	set backlog as a fudge factor. That's just too gross.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	We need to build a new sock struct.
	 *	It is sort of bad to have a socket without an inode attached
	 *	to it, but the wake_up's will just wake up the listening socket,
	 *	and if the listening socket is destroyed before this is taken
	 *	off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* just ignore the syn. It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	memcpy(newsk, sk, sizeof(*newsk));
	newsk->opt = NULL;
	newsk->ip_route_cache = NULL;
	if (opt && opt->optlen)
	{
		sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
		if (!sk->opt)
		{
			kfree_s(newsk, sizeof(struct sock));
			tcp_statistics.TcpAttemptFails++;
			kfree_skb(skb, FREE_READ);
			return;
		}
		if (ip_options_echo(sk->opt, opt, daddr, saddr, skb))
		{
			kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
			kfree_s(newsk, sizeof(struct sock));
			tcp_statistics.TcpAttemptFails++;
			kfree_skb(skb, FREE_READ);
			return;
		}
	}
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->ato = HZ/3;
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	newsk->acked_seq = skb->seq+1;
	newsk->lastwin_seq = skb->seq+1;
	newsk->delay_acks = 1;
	newsk->copied_seq = skb->seq+1;
	newsk->fin_seq = skb->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger = 0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function = &tcp_retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/*
	 *	Swap these two, they are from our point of view.
	 */

	newsk->daddr = saddr;
	newsk->saddr = daddr;
	newsk->rcv_saddr = daddr;

	put_sock(newsk->num,newsk);
	newsk->acked_seq = skb->seq + 1;
	newsk->copied_seq = skb->seq + 1;
	newsk->socket = NULL;

	/*
	 *	Grab the ttl and tos values and use them
	 */

	newsk->ip_ttl = sk->ip_ttl;
	newsk->ip_tos = skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for
	 */

	/*
	 *	Note use of sk->user_mss, since user has no direct access to newsk
	 */

	rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
	newsk->ip_route_cache = rt;

	if (rt != NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if (rt)
		newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	else
		newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

	/*
	 *	But not bigger than device MTU
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

#ifdef CONFIG_SKIP

	/*
	 *	SKIP devices set their MTU to 65535. This is so they can take packets
	 *	unfragmented to security process then fragment. They could lie to the
	 *	TCP layer about a suitable MTU, but it's easier to let skip sort it out
	 *	simply because the final package we want unfragmented is going to be
	 *
	 *	[IPHDR][IPSP][Security data][Modified TCP data][Security data]
	 */

	if (skip_pick_mtu != NULL)	/* If SKIP is loaded.. */
		sk->mtu = skip_pick_mtu(sk->mtu, dev);
#endif
	/*
	 *	This will min with what arrived in the packet
	 */

	tcp_options(newsk,skb->h.th);

	tcp_cache_zap();
	tcp_send_synack(newsk, sk, skb);
}
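
/*
 * Arithmetic note (editor's addition): sk->mtu here really tracks the
 * TCP payload size. On a 1500 byte Ethernet device the clamp above is
 * 1500 - 20 (iphdr) - 20 (tcphdr) = 1460, which tcp_options() then
 * reduces further if the peer's SYN carried a smaller MSS option.
 */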

/*
 *	Handle a TCP window that shrunk on us. It shouldn't happen,
 *	but..
 *
 *	We may need to move packets from the send queue
 *	to the write queue, if the window has been shrunk on us.
 *	The RFC says you are not allowed to shrink your window
 *	like this, but if the other end does, you must be able
 *	to deal with it.
 */
void tcp_window_shrunk(struct sock * sk, u32 window_seq)
{
	struct sk_buff *skb;
	struct sk_buff *skb2;
	struct sk_buff *wskb = NULL;

	skb2 = sk->send_head;
	sk->send_head = NULL;
	sk->send_tail = NULL;

	/*
	 *	This is an artifact of a flawed concept. We want one
	 *	queue and a smarter send routine when we send all.
	 */
	cli();
	while (skb2 != NULL)
	{
		skb = skb2;
		skb2 = skb->link3;
		skb->link3 = NULL;
		if (after(skb->end_seq, window_seq))
		{
			if (sk->packets_out > 0)
				sk->packets_out--;
			/* We may need to remove this from the dev send list. */
			if (skb->next != NULL)
			{
				skb_unlink(skb);
			}
			/* Now add it to the write_queue. */
			if (wskb == NULL)
				skb_queue_head(&sk->write_queue,skb);
			else
				skb_append(wskb,skb);
			wskb = skb;
		}
		else
		{
			if (sk->send_head == NULL)
			{
				sk->send_head = skb;
				sk->send_tail = skb;
			}
			else
			{
				sk->send_tail->link3 = skb;
				sk->send_tail = skb;
			}
			skb->link3 = NULL;
		}
	}
	sti();
}
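
/*
 * Example (editor's addition): suppose segments ending at 100, 200 and
 * 300 sit on the retransmit list and the peer's new right edge
 * (window_seq) is 150. The first segment stays on send_head; the other
 * two are unlinked, put back on write_queue in their original order,
 * and packets_out drops by two, so they are only re-sent once the
 * window reopens.
 */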

/*
 *	This routine deals with incoming acks, but not outgoing ones.
 *
 *	This routine is totally _WRONG_. The list structuring is wrong,
 *	the algorithm is wrong, the code is wrong.
 */

static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
{
	int flag = 0;
	u32 window_seq;

	/*
	 *	1 - there was data in packet as well as ack or new data is sent or
	 *	    in shutdown state
	 *	2 - data from retransmit queue was acked and removed
	 *	4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if (sk->zapped)
		return(1);	/* Dead, can't ack any more so why bother */

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */

	if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
		goto uninteresting_ack;

	/*
	 *	If there is data set flag 1
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 *	Have we discovered a larger window
	 */
	window_seq = ntohs(th->window);
	if (window_seq > sk->max_window)
	{
		sk->max_window = window_seq;
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(window_seq>>1, sk->mtu);
#else
		sk->mss = min(window_seq, sk->mtu);
#endif
	}
	window_seq += ack;

	/*
	 *	See if our window has been shrunk.
	 */
	if (after(sk->window_seq, window_seq)) {
		flag |= 4;
		tcp_window_shrunk(sk, window_seq);
	}

	/*
	 *	Update the right hand window edge of the host
	 */
	sk->window_seq = window_seq;

	/*
	 *	Pipe has emptied
	 */
	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out = 0;
	}

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
	    sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
		 * counter and increment it once every cwnd times. It's possible
		 * that this should be done only if sk->retransmits == 0. I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly. In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}
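
	/*
	 * Worked example (editor's addition): with ssthresh = 8, each ack
	 * of new data below the threshold does cong_window++, roughly
	 * doubling the window every round trip (slow start). Once
	 * cong_window reaches 8, cong_count must count up to cong_window
	 * before the window grows by one more segment, i.e. about one
	 * increment per full window of acks (congestion avoidance).
	 */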

	/*
	 *	Remember the highest ack received.
	 */

	sk->rcv_ack_seq = ack;

	/*
	 *	We passed data and got it acked, remove any soft error
	 *	log. Something worked...
	 */

	sk->err_soft = 0;

	/*
	 *	If this ack opens up a zero window, clear backoff. It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open ?
		 */

		if (skb_peek(&sk->write_queue) != NULL &&	/* should always be non-null */
		    !before(sk->window_seq, sk->write_queue.next->end_seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt. This eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < HZ/5)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = HZ/5;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	while (sk->send_head != NULL)
	{
		/* Check for a bug. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->end_seq, sk->send_head->link3->end_seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived at the other end.
		 */

		if (before(sk->send_head->end_seq, ack+1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 *	We were retransmitting. don't count this in RTT est
				 */
				flag |= 2;

				/*
				 *	even though we've gotten an ack, we're still
				 *	retransmitting as long as we're sending from
				 *	the retransmit queue. Keeping retransmits non-zero
				 *	prevents us from getting new data interspersed with
				 *	retransmissions.
				 */

				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 *	Note that we only reset backoff and rto in the
			 *	rtt recomputation code. And that doesn't happen
			 *	if there were retransmissions in effect. So the
			 *	first new packet after the retransmissions is
			 *	sent with the backoff still in effect. Not until
			 *	we get an ack from a non-retransmitted packet do
			 *	we reset the backoff and rto. This allows us to deal
			 *	with a situation where the network delay has increased
			 *	suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 *	We have one less packet out there.
			 */

			if (sk->packets_out > 0)
				sk->packets_out--;

			oskb = sk->send_head;

			if (!(flag&2))	/* Not retransmitting */
				tcp_rtt_estimator(sk,oskb);
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt'
					   in this case as we just set it up */
			cli();
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 *	We may need to remove this from the dev send list.
			 */

			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE);	/* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 * XXX someone ought to look at this too.. at the moment, if skb_peek()
	 * returns non-NULL, we completely ignore the timer stuff in the else
	 * clause. We ought to organize the code so that else clause can
	 * (should) be executed regardless, possibly moving the PROBE timer
	 * reset over. The skb_peek() thing should only move stuff to the
	 * write queue, NOT also manage the timer functions.
	 */

	/*
	 *	Maybe we can take some stuff off of the write queue,
	 *	and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after(sk->window_seq+1, sk->write_queue.next->end_seq) &&
		    (sk->retransmits == 0 ||
		     sk->ip_xmit_timeout != TIME_WRITE ||
		     before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
		    && sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
			 sk->send_head == NULL &&
			 sk->ack_backlog == 0 &&
			 sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room.
			 */
			tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 *	from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 *	from TCP_CLOSE we don't do anything
		 *
		 *	from anything else, if there is write data (or fin) pending,
		 *	we use a TIME_WRITE timeout, else if keepalive we reset to
		 *	a KEEPALIVE timeout, else we delete the timer.
		 *
		 *	We do not set flag for nominal write data, otherwise we may
		 *	force a state where we start to write itsy bitsy tidbits
		 *	of data.
		 */

		switch (sk->state) {
			case TCP_TIME_WAIT:
				/*
				 *	keep us in TIME_WAIT until we stop getting packets,
				 *	reset the timeout.
				 */
				tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
				break;
			case TCP_CLOSE:
				/*
				 *	don't touch the timer.
				 */
				break;
			default:
				/*
				 *	Must check send_head, write_queue, and ack_backlog
				 *	to determine which timeout to use.
				 */
				if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
					tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
				} else if (sk->keepopen) {
					tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
				} else {
					del_timer(&sk->retransmit_timer);
					sk->ip_xmit_timeout = 0;
				}
				break;
		}
	}

	/*
	 *	We have nothing queued but space to send. Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0 && sk->partial != NULL &&
	    skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 *	In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
	 *	we are now waiting for an acknowledge to our FIN. The other end is
	 *	already in TIME_WAIT.
	 *
	 *	Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->debug)
			printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
				sk->rcv_ack_seq, sk->write_seq, sk->acked_seq, sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			sk->shutdown = SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_CLOSE);
			return 1;
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 *	Move to FIN_WAIT2 to await a FIN from the other end. Set
	 *	SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 *	Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way handshake
	 */

	if (sk->state == TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest = th->source;
		sk->copied_seq = sk->acked_seq;
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->max_window == 0)
		{
			sk->max_window = 32;	/* Sanity check */
			sk->mss = min(sk->max_window, sk->mtu);
		}
	}

	/*
	 *	I make no guarantees about the first clause in the following
	 *	test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
	 *	what conditions "!flag" would be true. However I think the rest
	 *	of the conditions would prevent that from causing any
	 *	unnecessary retransmission.
	 *	Clearly if the first packet has expired it should be
	 *	retransmitted. The other alternative, "flag&2 && retransmits", is
	 *	harder to explain: You have to look carefully at how and when the
	 *	timer is set and with what timeout. The most recent transmission always
	 *	sets the timer. So in general if the most recent thing has timed
	 *	out, everything before it has as well. So we want to go ahead and
	 *	retransmit some more. If we didn't explicitly test for this
	 *	condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 *	would not be true. If you look at the pattern of timing, you can
	 *	show that rto is increased fast enough that the next packet would
	 *	almost never be retransmitted immediately. Then you'd end up
	 *	waiting for a timeout to send each packet on the retransmission
	 *	queue. With my implementation of the Karn sampling algorithm,
	 *	the timeout would double each time. The net result is that it would
	 *	take a hideous amount of time to recover from a single dropped packet.
	 *	It's possible that there should also be a test for TIME_WRITE, but
	 *	I think as long as "send_head != NULL" and "retransmit" is on, we've
	 *	got to be in real retransmission mode.
	 *	Note that tcp_do_retransmit is called with all==1. Setting cong_window
	 *	back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 *	As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	    (((flag&2) && sk->retransmits) ||
	     (sk->send_head->when + sk->rto < jiffies)))
	{
		if (sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return 1;

uninteresting_ack:
	if (sk->debug)
		printk("Ack ignored %u %u\n",ack,sk->sent_seq);

	/*
	 *	Keepalive processing.
	 */

	if (after(ack, sk->sent_seq))
	{
		return 0;
	}

	/*
	 *	Restart the keepalive timer.
	 */

	if (sk->keepopen)
	{
		if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
			tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
	}
	return 1;
}

/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 */

static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	sk->fin_seq = skb->end_seq;

	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch (sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 *	move to CLOSE_WAIT, tcp_data() already handled
			 *	sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 *	received a retransmission of the FIN, do
			 *	nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 *	received a retransmission of the FIN,
			 *	restart the TIME_WAIT timer.
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 *	This case occurs when a simultaneous close
			 *	happens, we must ack the received FIN and
			 *	enter the CLOSING state.
			 *
			 *	This causes a WRITE timeout, which will either
			 *	move on to TIME_WAIT when we timeout, or resend
			 *	the FIN properly (maybe we get rid of that annoying
			 *	FIN lost hang). The TIME_WRITE code is already correct
			 *	for handling this timeout.
			 */

			if (sk->ip_xmit_timeout != TIME_WRITE)
				tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 *	received a FIN -- send ACK and enter TIME_WAIT
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown |= SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 *	already in CLOSE
			 */
			break;
		default:
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}

/*
 *	Called for each packet when we find a new ACK endpoint sequence in it
 */
static inline u32 tcp_queue_ack(struct sk_buff * skb, struct sock * sk)
{
	/*
	 *	When we ack the fin, we do the FIN
	 *	processing.
	 */
	skb->acked = 1;
	if (skb->h.th->fin)
		tcp_fin(skb,sk,skb->h.th);
	return skb->end_seq;
}

/*
 *	Add a sk_buff to the TCP receive queue, calculating
 *	the ACK sequence as we go..
 */
static void tcp_queue(struct sk_buff * skb, struct sock * sk,
	 struct tcphdr *th, unsigned long saddr)
{
	struct sk_buff_head * list = &sk->receive_queue;
	struct sk_buff * next;
	u32 ack_seq;

	/*
	 *	Find where the new skb goes.. (This goes backwards,
	 *	on the assumption that we get the packets in order)
	 */
	next = list->prev;
	while (next != (struct sk_buff *) list) {
		if (!after(next->seq, skb->seq))
			break;
		next = next->prev;
	}
	/*
	 *	put it after the packet we found (which
	 *	may be the list-head, but that's fine).
	 */
	__skb_append(next, skb, list);
	next = skb->next;

	/*
	 *	Did we get anything new to ack?
	 */
	ack_seq = sk->acked_seq;
	if (!after(skb->seq, ack_seq) && after(skb->end_seq, ack_seq)) {
		ack_seq = tcp_queue_ack(skb, sk);

		/*
		 *	Do we have any old packets to ack that the above
		 *	made visible? (Go forward from skb)
		 */
		while (next != (struct sk_buff *) list) {
			if (after(next->seq, ack_seq))
				break;
			if (after(next->end_seq, ack_seq))
				ack_seq = tcp_queue_ack(next, sk);
			next = next->next;
		}

		/*
		 *	Ok, we found new data, update acked_seq as
		 *	necessary (and possibly send the actual
		 *	ACK packet).
		 *
		 *	rules for delaying an ack:
		 *	- delay time <= 0.5 HZ
		 *	- we don't have a window update to send
		 *	- must send at least every 2 full sized packets
		 */
		sk->acked_seq = ack_seq;
		if (!sk->delay_acks ||
		    /* sk->ack_backlog >= sk->max_ack_backlog || */
		    sk->bytes_rcv > sk->max_unacked || th->fin ||
		    sk->ato > HZ/2) {
			tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
		}
		else
		{
			sk->ack_backlog++;
			if (sk->debug)
				printk("Ack queued.\n");
			tcp_reset_xmit_timer(sk, TIME_WRITE, sk->ato);
		}
	}
}
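
/*
 * Example (editor's addition): say acked_seq is 1100 after receiving
 * [1000,1100), and [1200,1300) is already queued out of order. When the
 * missing segment [1100,1200) arrives, tcp_queue_ack() lifts ack_seq to
 * 1200, and the forward scan then absorbs the queued [1200,1300) as
 * well, leaving acked_seq at 1300 in a single pass.
 */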

/*
 *	This routine handles the data. If there is room in the buffer,
 *	it will have already been moved into it. If there is no
 *	room, then we will just have to discard the packet.
 */

static int tcp_data(struct sk_buff *skb, struct sock *sk,
	 unsigned long saddr, unsigned short len)
{
	struct tcphdr *th;
	u32 new_seq, shut_seq;

	th = skb->h.th;
	skb_pull(skb,th->doff*4);
	skb_trim(skb,len-(th->doff*4));

	/*
	 *	The bytes in the receive read/assembly queue have increased.
	 *	Needed for the low memory discard algorithm.
	 */

	sk->bytes_rcv += skb->len;

	if (skb->len == 0 && !th->fin)
	{
		/*
		 *	Don't want to keep passing ack's back and forth.
		 *	(someone sent us dataless, boring frame)
		 */
		if (!th->ack)
			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
		kfree_skb(skb, FREE_READ);
		return(0);
	}

	/*
	 *	We no longer have anyone receiving data on this connection.
	 */

#ifndef TCP_DONT_RST_SHUTDOWN

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		/*
		 *	FIXME: BSD has some magic to avoid sending resets to
		 *	broken 4.2 BSD keepalives. Much to my surprise a few non
		 *	BSD stacks still have broken keepalives so we want to
		 *	cope with it.
		 */

		if (skb->len)	/* We don't care if it's just an ack or
				   a keepalive/window probe */
		{
			new_seq = skb->seq + skb->len + th->syn;	/* Right edge of _data_ part of frame */

			/* Do this the way 4.4BSD treats it. Not what I'd
			   regard as the meaning of the spec but it's what BSD
			   does and clearly they know everything 8) */

			/*
			 *	This is valid because of two things
			 *
			 *	a) The way tcp_data behaves at the bottom.
			 *	b) A fin takes effect when read not when received.
			 */

			shut_seq = sk->acked_seq+1;	/* Last byte */

			if (after(new_seq,shut_seq))
			{
				if (sk->debug)
					printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
						sk, new_seq, shut_seq, sk->blog);
				if (sk->dead)
				{
					sk->acked_seq = new_seq + th->fin;
					tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
						sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
					tcp_statistics.TcpEstabResets++;
					sk->err = EPIPE;
					sk->error_report(sk);
					sk->shutdown = SHUTDOWN_MASK;
					tcp_set_state(sk,TCP_CLOSE);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
			}
		}
	}

#endif

	tcp_queue(skb, sk, th, saddr);

	/*
	 *	If we've missed a packet, send an ack.
	 *	Also start a timer to send another.
	 */

	if (!skb->acked)
	{
		/*
		 *	This is important. If we don't have much room left,
		 *	we need to throw out a few packets so we have a good
		 *	window. Note that mtu is used, not mss, because mss is really
		 *	for the send side. He could be sending us stuff as large as mtu.
		 */

		while (sock_rspace(sk) < sk->mtu)
		{
			struct sk_buff * skb1 = skb_peek(&sk->receive_queue);
			if (skb1 == NULL)
			{
				printk("INET: tcp.c:tcp_data memory leak detected.\n");
				break;
			}

			/*
			 *	Don't throw out something that has been acked.
			 */

			if (skb1->acked)
			{
				break;
			}

			skb_unlink(skb1);
			kfree_skb(skb1, FREE_READ);
		}
		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
		sk->ack_backlog++;
		tcp_reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, HZ/2));
	}

	/*
	 *	Now tell the user we may have some data.
	 */

	if (!sk->dead)
	{
		if (sk->debug)
			printk("Data wakeup.\n");
		sk->data_ready(sk,0);
	}
	return(0);
}

/*
 *	This routine is only called when we have urgent data
 *	signalled. It's the 'slow' part of tcp_urg. It could be
 *	moved inline now as tcp_urg is only called from one
 *	place. We handle URGent data wrong. We have to - as
 *	BSD still doesn't use the correction from RFC961.
 */

static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
{
	u32 ptr = ntohs(th->urg_ptr);

	if (ptr)
		ptr--;
	ptr += ntohl(th->seq);

	/* ignore urgent data that we've already seen and read */
	if (after(sk->copied_seq, ptr))
		return;

	/* do we already have a newer (or duplicate) urgent pointer? */
	if (sk->urg_data && !after(ptr, sk->urg_seq))
		return;

	/* tell the world about our new urgent pointer */
	if (sk->proc != 0) {
		if (sk->proc > 0) {
			kill_proc(sk->proc, SIGURG, 1);
		} else {
			kill_pg(-sk->proc, SIGURG, 1);
		}
	}
	sk->urg_data = URG_NOTYET;
	sk->urg_seq = ptr;
}
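
/*
 * Example (editor's addition): for a segment with seq = 1000 and
 * urg_ptr = 5 the code above computes urg_seq = 1000 + 5 - 1 = 1004,
 * i.e. it follows the BSD reading, where the urgent pointer points one
 * past the urgent byte, rather than at it as the RFC correction would
 * have it.
 */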

/*
 *	This is the 'fast' part of urgent handling.
 */

static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
{
	/*
	 *	Check if we get a new urgent pointer - normally not
	 */

	if (th->urg)
		tcp_check_urg(sk,th);

	/*
	 *	Do we wait for any urgent data? - normally not
	 */

	if (sk->urg_data == URG_NOTYET) {
		u32 ptr;

		/*
		 *	Is the urgent pointer pointing into this packet?
		 */
		ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
		if (ptr < len) {
			sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
			if (!sk->dead)
				sk->data_ready(sk,0);
		}
	}
}

/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok = 0;

	/*
	 *	"redo" is 1 if we have already seen this skb but couldn't
	 *	use it at that time (the socket was locked). In that case
	 *	we have already done a lot of the work (looked up the socket
	 *	etc).
	 */
	th = skb->h.th;
	sk = skb->sk;
	if (!redo) {
		tcp_statistics.TcpInSegs++;
		if (skb->pkt_type != PACKET_HOST)
			goto discard_it;

		/*
		 *	Pull up the IP header.
		 */

		skb_pull(skb, skb->h.raw-skb->data);

		/*
		 *	Try to use the device checksum if provided.
		 */
		switch (skb->ip_summed)
		{
			case CHECKSUM_NONE:
				skb->csum = csum_partial((char *)th, len, 0);
			case CHECKSUM_HW:
				if (tcp_check(th, len, saddr, daddr, skb->csum))
					goto discard_it;
			default:
				/* CHECKSUM_UNNECESSARY */
				break;
		}
		sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
		if (!sk)
			goto no_tcp_socket;
		skb->sk = sk;
		skb->seq = ntohl(th->seq);
		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
		skb->ack_seq = ntohl(th->ack_seq);

		skb->acked = 0;
		skb->used = 0;
		skb->free = 1;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/*
		 *	We may need to add it to the backlog here.
		 */
		if (sk->users)
		{
			__skb_queue_tail(&sk->back_log, skb);
			return(0);
		}
	}
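
	/*
	 * Editor's note: in the checksum switch above, CHECKSUM_NONE
	 * deliberately falls through into CHECKSUM_HW so that a freshly
	 * computed software checksum is verified by the same tcp_check()
	 * call. A segment parked on sk->back_log because the socket was
	 * busy re-enters this function later with redo == 1 and skips the
	 * demultiplexing work just done.
	 */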

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk->zapped || sk->state == TCP_CLOSE)
		goto no_tcp_socket;

	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}

	/*
	 *	Charge the memory to the socket.
	 */

	skb->sk = sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	We should now do header prediction.
	 */

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if (sk->state != TCP_ESTABLISHED)	/* Skip this lot for normal flow */
	{
		/*
		 *	Now deal with unusual cases.
		 */

		if (sk->state == TCP_LISTEN)
		{
			if (th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it
			 */

			if (th->rst || !th->syn || th->ack || ip_chk_addr(daddr) != IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 *
			 *	Now that TTCP is starting to be used we ought to queue this data.
			 */

			return 0;
		}

		/*
		 *	Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
		 *	then it's a new connection
		 */

		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected. The SYN_SENT case is unusual and should
		 *	not be in line code. [AC]
		 */

		if (sk->state == TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if (th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if (!tcp_ack(sk,th,skb->ack_seq,len))
				{
					/* Reset the ack - it's an ack from a
					   different connection  [ th->rst is checked in tcp_send_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return(0);
				}
				if (th->rst)
					return tcp_reset(sk,skb);
				if (!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok = 1;	/* Don't reset this connection for the syn */
				sk->acked_seq = skb->seq+1;
				sk->lastwin_seq = skb->seq+1;
				sk->fin_seq = skb->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest = th->source;
				sk->copied_seq = sk->acked_seq;
				if (!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if (sk->max_window == 0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if (th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if (sk->saddr == saddr && sk->daddr == daddr &&
					    sk->dummy_th.source == th->source &&
					    sk->dummy_th.dest == th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

		/*
		 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
		 *	a more complex suggestion for fixing these reuse issues in RFC1644
		 *	but not yet ready for general use. Also see RFC1379.
		 *
		 *	Note the funny way we go back to the top of this function for
		 *	this case ("goto try_next_socket"). That also takes care of
		 *	checking "sk->users" for the new socket as well as doing all
		 *	the normal tests on the packet.
		 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
		    after(skb->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if (sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err = ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			/* this is not really correct: we should check sk->users */
			if (sk && sk->state == TCP_LISTEN)
			{
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				tcp_conn_request(sk, skb, daddr, saddr, opt, dev, seq+128000);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */

	if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
	{
		bad_tcp_sequence(sk, th, len, opt, saddr, dev);
		kfree_skb(skb, FREE_READ);
		return 0;
	}

	if (th->rst)
		return tcp_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if (th->syn && !syn_ok)
	{
		tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_reset(sk,skb);
	}

	tcp_delack_estimator(sk);

	/*
	 *	Process the ACK
	 */

	if (th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if (sk->state == TCP_SYN_RECV)
		{
			tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		return 0;
	}

rfc_step6:	/* I'll clean this up later */

	/*
	 *	If the accepted buffer put us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */

	if (sk->rmem_alloc >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		return(0);
	}

	/*
	 *	Process urgent data
	 */

	tcp_urg(sk, th, len);

	/*
	 *	Process the encapsulated data
	 */

	if (tcp_data(skb,sk, saddr, len))
		kfree_skb(skb, FREE_READ);

	/*
	 *	And done
	 */

	return 0;

no_tcp_socket:
	/*
	 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
	 */
	tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);

discard_it:
	/*
	 *	Discard frame
	 */
	skb->sk = NULL;
	kfree_skb(skb, FREE_READ);
	return 0;
}