net/ipv4/tcp

/* */
This source file includes the following definitions.
tcp_delack_estimator
tcp_rtt_estimator
tcp_cache_zap
get_tcp_sock
bad_tcp_sequence
tcp_sequence
tcp_reset
tcp_options
tcp_conn_request
tcp_window_shrunk
tcp_ack
tcp_fin
tcp_data
tcp_check_urg
tcp_urg
tcp_rcv
   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp_input.c 1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * FIXES
  23  *              Pedro Roque     :       Double ACK bug
  24  */
  25 
  26 #include <linux/config.h>
  27 #include <net/tcp.h>
  28 
  29 /*
  30  *      Policy code extracted so it's now separate
  31  */
  32 
  33 /*
  34  *      Called each time to estimate the delayed ack timeout. This is
  35  *      how it should be done so a fast link isn't impacted by ack delay.
  36  */
  37  
  38 extern __inline__ void tcp_delack_estimator(struct sock *sk)
     /*  */
  39 {
  40         /*
  41          *      Delayed ACK time estimator.
  42          */
  43         
  44         if (sk->lrcvtime == 0) 
  45         {
  46                 sk->lrcvtime = jiffies;
  47                 sk->ato = HZ/3;
  48         }
  49         else 
  50         {
  51                 int m;
  52                 
  53                 m = jiffies - sk->lrcvtime;
  54 
  55                 sk->lrcvtime = jiffies;
  56 
  57                 if (m <= 0)
  58                         m = 1;
  59 
  60                 if (m > (sk->rtt >> 3)) 
  61                 {
  62                         sk->ato = sk->rtt >> 3;
  63                         /*
  64                          * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
  65                          */
  66                 }
  67                 else 
  68                 {
  69                         sk->ato = (sk->ato >> 1) + m;
  70                         /*
  71                          * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
  72                          */
  73                 }
  74         }
  75 }
  76 
  77 /*
  78  *      Called on frames that were known _not_ to have been
  79  *      retransmitted [see Karn/Partridge Proceedings SIGCOMM 87]. 
  80  *      The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
  81  */
  82  
  83 extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
     /*  */
  84 {
  85         long m;
  86         /*
  87          *      The following amusing code comes from Jacobson's
  88          *      article in SIGCOMM '88.  Note that rtt and mdev
  89          *      are scaled versions of rtt and mean deviation.
  90          *      This is designed to be as fast as possible 
  91          *      m stands for "measurement".
  92          */
  93         
  94         m = jiffies - oskb->when;  /* RTT */
  95         if(m<=0)
  96                 m=1;            /* IS THIS RIGHT FOR <0 ??? */
  97         m -= (sk->rtt >> 3);    /* m is now error in rtt est */
  98         sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
  99         if (m < 0)
 100                 m = -m;         /* m is now abs(error) */
 101         m -= (sk->mdev >> 2);   /* similar update on mdev */
 102         sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
 103 
 104         /*
 105          *      Now update timeout.  Note that this removes any backoff.
 106          */
 107                          
 108         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
 109         if (sk->rto > 120*HZ)
 110                 sk->rto = 120*HZ;
 111         if (sk->rto < HZ/5)     /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
 112                 sk->rto = HZ/5;
 113         sk->backoff = 0;
 114 }
 115 
 116 /*
 117  *      Cached last hit socket
 118  */
 119  
 120 static volatile unsigned long   th_cache_saddr, th_cache_daddr;
 121 static volatile unsigned short  th_cache_dport, th_cache_sport;
 122 static volatile struct sock *th_cache_sk;
 123 
 124 void tcp_cache_zap(void)
     /*  */
 125 {
 126         th_cache_sk=NULL;
 127 }
 128 
 129 /*
 130  *      Find the socket, using the last hit cache if applicable. The cache is not quite
 131  *      right...
 132  */
 133 
 134 static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
     /*  */
 135 {
 136         struct sock * sk;
 137 
 138         sk = (struct sock *) th_cache_sk;
 139         if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
 140             sport != th_cache_sport || dport != th_cache_dport) {
 141                 sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
 142                 if (sk) {
 143                         th_cache_saddr=saddr;
 144                         th_cache_daddr=daddr;
 145                         th_cache_dport=dport;
 146                         th_cache_sport=sport;
 147                         th_cache_sk=sk;
 148                 }
 149         }
 150         return sk;
 151 }
 152 
 153 /*
 154  * React to an out-of-window TCP sequence number in an incoming packet
 155  */
 156  
 157 static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
     /*  */
 158              struct options *opt, unsigned long saddr, struct device *dev)
 159 {
 160         if (th->rst)
 161                 return;
 162 
 163         /*
 164          *      Send a reset if we get something not ours and we are
 165          *      unsynchronized. Note: We don't do anything to our end. We
 166          *      are just killing the bogus remote connection then we will
 167          *      connect again and it will work (with luck).
 168          */
 169          
 170         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
 171         {
 172                 tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
 173                 return;
 174         }
 175         
 176         /*
 177          *      4.3reno machines look for these kind of acks so they can do fast
 178          *      recovery. Three identical 'old' acks lets it know that one frame has
 179          *      been lost and should be resent. Because this is before the whole window
 180          *      of data has timed out it can take one lost frame per window without
 181          *      stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
 182          *
 183          *      We also should be spotting triple bad sequences.
 184          */
 185         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
 186         return;
 187 }
 188 
 189 /*
 190  *      This function checks to see if the tcp header is actually acceptable.
 191  */
 192  
 193 extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
     /*  */
 194 {
 195         u32 end_window = sk->acked_seq + sk->window;
 196         return  /* if start is at end of window, end must be too (zero window) */
 197                 (seq == end_window && seq == end_seq) ||
 198                 /* if start is before end of window, check for interest */
 199                 (before(seq, end_window) && !before(end_seq, sk->acked_seq));
 200 }
 201 
 202 /*
 203  *      When we get a reset we do this. This probably is a tcp_output routine
 204  *      really.
 205  */
 206 
 207 static int tcp_reset(struct sock *sk, struct sk_buff *skb)
     /*  */
 208 {
 209         sk->zapped = 1;
 210         /*
 211          *      We want the right error as BSD sees it (and indeed as we do).
 212          */
 213         sk->err = ECONNRESET;
 214         if (sk->state == TCP_SYN_SENT)
 215                 sk->err = ECONNREFUSED;
 216         if (sk->state == TCP_CLOSE_WAIT)
 217                 sk->err = EPIPE;
 218 #ifdef CONFIG_TCP_RFC1337
 219         /*
 220          *      Time wait assassination protection [RFC1337]
 221          *
 222          *      This is a good idea, but causes more sockets to take time to close.
 223          *
 224          *      Ian Heavens has since shown this is an inadequate fix for the protocol
 225          *      bug in question.
 226          */
 227         if(sk->state!=TCP_TIME_WAIT)
 228         {       
 229                 tcp_set_state(sk,TCP_CLOSE);
 230                 sk->shutdown = SHUTDOWN_MASK;
 231         }
 232 #else   
 233         tcp_set_state(sk,TCP_CLOSE);
 234         sk->shutdown = SHUTDOWN_MASK;
 235 #endif  
 236         if (!sk->dead) 
 237                 sk->state_change(sk);
 238         kfree_skb(skb, FREE_READ);
 239         return(0);
 240 }
 241 
 242 
 243 /*
 244  *      Look for tcp options. Parses everything but only knows about MSS.
 245  *      This routine is always called with the packet containing the SYN.
 246  *      However it may also be called with the ack to the SYN.  So you
 247  *      can't assume this is always the SYN.  It's always called after
 248  *      we have set up sk->mtu to our own MTU.
 249  *
 250  *      We need at minimum to add PAWS support here. Possibly large windows
 251  *      as Linux gets deployed on 100Mb/sec networks.
 252  */
 253  
 254 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /*  */
 255 {
 256         unsigned char *ptr;
 257         int length=(th->doff*4)-sizeof(struct tcphdr);
 258         int mss_seen = 0;
 259     
 260         ptr = (unsigned char *)(th + 1);
 261   
 262         while(length>0)
 263         {
 264                 int opcode=*ptr++;
 265                 int opsize=*ptr++;
 266                 switch(opcode)
 267                 {
 268                         case TCPOPT_EOL:
 269                                 return;
 270                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
 271                                 length--;
 272                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
 273                                 continue;
 274                         
 275                         default:
 276                                 if(opsize<=2)   /* Avoid silly options looping forever */
 277                                         return;
 278                                 switch(opcode)
 279                                 {
 280                                         case TCPOPT_MSS:
 281                                                 if(opsize==4 && th->syn)
 282                                                 {
 283                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
 284                                                         mss_seen = 1;
 285                                                 }
 286                                                 break;
 287                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
 288                                 }
 289                                 ptr+=opsize-2;
 290                                 length-=opsize;
 291                 }
 292         }
 293         if (th->syn) 
 294         {
 295                 if (! mss_seen)
 296                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
 297         }
 298 #ifdef CONFIG_INET_PCTCP
 299         sk->mss = min(sk->max_window >> 1, sk->mtu);
 300 #else    
 301         sk->mss = min(sk->max_window, sk->mtu);
 302         sk->max_unacked = 2 * sk->mss;
 303 #endif  
 304 }
 305 
 306 
 307 /*
 308  *      This routine handles a connection request.
 309  *      It should make sure we haven't already responded.
 310  *      Because of the way BSD works, we have to send a syn/ack now.
 311  *      This also means it will be harder to close a socket which is
 312  *      listening.
 313  */
 314  
 315 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
     /*  */
 316                  u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
 317 {
 318         struct sock *newsk;
 319         struct tcphdr *th;
 320         struct rtable *rt;
 321   
 322         th = skb->h.th;
 323 
 324         /* If the socket is dead, don't accept the connection. */
 325         if (!sk->dead) 
 326         {
 327                 sk->data_ready(sk,0);
 328         }
 329         else 
 330         {
 331                 if(sk->debug)
 332                         printk("Reset on %p: Connect on dead socket.\n",sk);
 333                 tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
 334                 tcp_statistics.TcpAttemptFails++;
 335                 kfree_skb(skb, FREE_READ);
 336                 return;
 337         }
 338 
 339         /*
 340          *      Make sure we can accept more.  This will prevent a
 341          *      flurry of syns from eating up all our memory.
 342          *
 343          *      BSD does some funnies here and allows 3/2 times the
 344          *      set backlog as a fudge factor. Thats just too gross.
 345          */
 346 
 347         if (sk->ack_backlog >= sk->max_ack_backlog) 
 348         {
 349                 tcp_statistics.TcpAttemptFails++;
 350                 kfree_skb(skb, FREE_READ);
 351                 return;
 352         }
 353 
 354         /*
 355          * We need to build a new sock struct.
 356          * It is sort of bad to have a socket without an inode attached
 357          * to it, but the wake_up's will just wake up the listening socket,
 358          * and if the listening socket is destroyed before this is taken
 359          * off of the queue, this will take care of it.
 360          */
 361 
 362         newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
 363         if (newsk == NULL) 
 364         {
 365                 /* just ignore the syn.  It will get retransmitted. */
 366                 tcp_statistics.TcpAttemptFails++;
 367                 kfree_skb(skb, FREE_READ);
 368                 return;
 369         }
 370 
 371         memcpy(newsk, sk, sizeof(*newsk));
 372         newsk->opt = NULL;
 373         newsk->ip_route_cache  = NULL;
 374         if (opt && opt->optlen) 
 375         {
 376                 sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
 377                 if (!sk->opt) 
 378                 {
 379                         kfree_s(newsk, sizeof(struct sock));
 380                         tcp_statistics.TcpAttemptFails++;
 381                         kfree_skb(skb, FREE_READ);
 382                         return;
 383                 }
 384                 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) 
 385                 {
 386                         kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
 387                         kfree_s(newsk, sizeof(struct sock));
 388                         tcp_statistics.TcpAttemptFails++;
 389                         kfree_skb(skb, FREE_READ);
 390                         return;
 391                 }
 392         }
 393         skb_queue_head_init(&newsk->write_queue);
 394         skb_queue_head_init(&newsk->receive_queue);
 395         newsk->send_head = NULL;
 396         newsk->send_tail = NULL;
 397         skb_queue_head_init(&newsk->back_log);
 398         newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
 399         newsk->rto = TCP_TIMEOUT_INIT;
 400         newsk->mdev = 0;
 401         newsk->max_window = 0;
 402         newsk->cong_window = 1;
 403         newsk->cong_count = 0;
 404         newsk->ssthresh = 0;
 405         newsk->backoff = 0;
 406         newsk->blog = 0;
 407         newsk->intr = 0;
 408         newsk->proc = 0;
 409         newsk->done = 0;
 410         newsk->partial = NULL;
 411         newsk->pair = NULL;
 412         newsk->wmem_alloc = 0;
 413         newsk->rmem_alloc = 0;
 414         newsk->localroute = sk->localroute;
 415 
 416         newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
 417 
 418         newsk->err = 0;
 419         newsk->shutdown = 0;
 420         newsk->ack_backlog = 0;
 421         newsk->acked_seq = skb->seq+1;
 422         newsk->lastwin_seq = skb->seq+1;
 423         newsk->delay_acks = 1;
 424         newsk->copied_seq = skb->seq+1;
 425         newsk->fin_seq = skb->seq;
 426         newsk->state = TCP_SYN_RECV;
 427         newsk->timeout = 0;
 428         newsk->ip_xmit_timeout = 0;
 429         newsk->write_seq = seq; 
 430         newsk->window_seq = newsk->write_seq;
 431         newsk->rcv_ack_seq = newsk->write_seq;
 432         newsk->urg_data = 0;
 433         newsk->retransmits = 0;
 434         newsk->linger=0;
 435         newsk->destroy = 0;
 436         init_timer(&newsk->timer);
 437         newsk->timer.data = (unsigned long)newsk;
 438         newsk->timer.function = &net_timer;
 439         init_timer(&newsk->retransmit_timer);
 440         newsk->retransmit_timer.data = (unsigned long)newsk;
 441         newsk->retransmit_timer.function=&tcp_retransmit_timer;
 442         newsk->dummy_th.source = skb->h.th->dest;
 443         newsk->dummy_th.dest = skb->h.th->source;
 444         
 445         /*
 446          *      Swap these two, they are from our point of view. 
 447          */
 448          
 449         newsk->daddr = saddr;
 450         newsk->saddr = daddr;
 451         newsk->rcv_saddr = daddr;
 452 
 453         put_sock(newsk->num,newsk);
 454         newsk->acked_seq = skb->seq + 1;
 455         newsk->copied_seq = skb->seq + 1;
 456         newsk->socket = NULL;
 457 
 458         /*
 459          *      Grab the ttl and tos values and use them 
 460          */
 461 
 462         newsk->ip_ttl=sk->ip_ttl;
 463         newsk->ip_tos=skb->ip_hdr->tos;
 464 
 465         /*
 466          *      Use 512 or whatever user asked for 
 467          */
 468 
 469         /*
 470          *      Note use of sk->user_mss, since user has no direct access to newsk 
 471          */
 472 
 473         rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
 474         newsk->ip_route_cache = rt;
 475         
 476         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
 477                 newsk->window_clamp = rt->rt_window;
 478         else
 479                 newsk->window_clamp = 0;
 480                 
 481         if (sk->user_mss)
 482                 newsk->mtu = sk->user_mss;
 483         else if (rt)
 484                 newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
 485         else 
 486                 newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
 487 
 488         /*
 489          *      But not bigger than device MTU 
 490          */
 491 
 492         newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
 493 
 494 #ifdef CONFIG_SKIP
 495         
 496         /*
 497          *      SKIP devices set their MTU to 65535. This is so they can take packets
 498          *      unfragmented to security process then fragment. They could lie to the
 499          *      TCP layer about a suitable MTU, but its easier to let skip sort it out
 500          *      simply because the final package we want unfragmented is going to be
 501          *
 502          *      [IPHDR][IPSP][Security data][Modified TCP data][Security data]
 503          */
 504          
 505         if(skip_pick_mtu!=NULL)         /* If SKIP is loaded.. */
 506                 sk->mtu=skip_pick_mtu(sk->mtu,dev);
 507 #endif
 508         /*
 509          *      This will min with what arrived in the packet 
 510          */
 511 
 512         tcp_options(newsk,skb->h.th);
 513         
 514         tcp_cache_zap();
 515         tcp_send_synack(newsk, sk, skb);
 516 }
 517 
 518 
 519 /*
 520  * Handle a TCP window that shrunk on us. It shouldn't happen,
 521  * but..
 522  *
 523  * We may need to move packets from the send queue
 524  * to the write queue, if the window has been shrunk on us.
 525  * The RFC says you are not allowed to shrink your window
 526  * like this, but if the other end does, you must be able
 527  * to deal with it.
 528  */
 529 void tcp_window_shrunk(struct sock * sk, u32 window_seq)
     /*  */
 530 {
 531         struct sk_buff *skb;
 532         struct sk_buff *skb2;
 533         struct sk_buff *wskb = NULL;
 534         
 535         skb2 = sk->send_head;
 536         sk->send_head = NULL;
 537         sk->send_tail = NULL;
 538 
 539         /*
 540          *      This is an artifact of a flawed concept. We want one
 541          *      queue and a smarter send routine when we send all.
 542          */
 543         cli();
 544         while (skb2 != NULL) 
 545         {
 546                 skb = skb2;
 547                 skb2 = skb->link3;
 548                 skb->link3 = NULL;
 549                 if (after(skb->end_seq, window_seq)) 
 550                 {
 551                         if (sk->packets_out > 0) 
 552                                 sk->packets_out--;
 553                         /* We may need to remove this from the dev send list. */
 554                         if (skb->next != NULL) 
 555                         {
 556                                 skb_unlink(skb);                                
 557                         }
 558                         /* Now add it to the write_queue. */
 559                         if (wskb == NULL)
 560                                 skb_queue_head(&sk->write_queue,skb);
 561                         else
 562                                 skb_append(wskb,skb);
 563                         wskb = skb;
 564                 } 
 565                 else 
 566                 {
 567                         if (sk->send_head == NULL) 
 568                         {
 569                                 sk->send_head = skb;
 570                                 sk->send_tail = skb;
 571                         }
 572                         else
 573                         {
 574                                 sk->send_tail->link3 = skb;
 575                                 sk->send_tail = skb;
 576                         }
 577                         skb->link3 = NULL;
 578                 }
 579         }
 580         sti();
 581 }
 582 
 583 
 584 /*
 585  *      This routine deals with incoming acks, but not outgoing ones.
 586  *
 587  *      This routine is totally _WRONG_. The list structuring is wrong,
 588  *      the algorithm is wrong, the code is wrong.
 589  */
 590 
 591 static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
     /*  */
 592 {
 593         int flag = 0;
 594         u32 window_seq;
 595 
 596         /* 
 597          * 1 - there was data in packet as well as ack or new data is sent or 
 598          *     in shutdown state
 599          * 2 - data from retransmit queue was acked and removed
 600          * 4 - window shrunk or data from retransmit queue was acked and removed
 601          */
 602 
 603         if(sk->zapped)
 604                 return(1);      /* Dead, cant ack any more so why bother */
 605 
 606         /*
 607          *      We have dropped back to keepalive timeouts. Thus we have
 608          *      no retransmits pending.
 609          */
 610          
 611         if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
 612                 sk->retransmits = 0;
 613 
 614         /*
 615          *      If the ack is newer than sent or older than previous acks
 616          *      then we can probably ignore it.
 617          */
 618          
 619         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
 620                 goto uninteresting_ack;
 621 
 622         /*
 623          *      If there is data set flag 1
 624          */
 625          
 626         if (len != th->doff*4) 
 627                 flag |= 1;
 628 
 629         /*
 630          *      Have we discovered a larger window
 631          */
 632         window_seq = ntohs(th->window);
 633         if (window_seq > sk->max_window) 
 634         {
 635                 sk->max_window = window_seq;
 636 #ifdef CONFIG_INET_PCTCP
 637                 /* Hack because we don't send partial packets to non SWS
 638                    handling hosts */
 639                 sk->mss = min(window_seq>>1, sk->mtu);
 640 #else
 641                 sk->mss = min(window_seq, sk->mtu);
 642 #endif  
 643         }
 644         window_seq += ack;
 645 
 646         /*
 647          *      See if our window has been shrunk. 
 648          */
 649         if (after(sk->window_seq, window_seq)) {
 650                 flag |= 4;
 651                 tcp_window_shrunk(sk, window_seq);
 652         }
 653 
 654         /*
 655          *      Update the right hand window edge of the host
 656          */
 657         sk->window_seq = window_seq;
 658 
 659         /*
 660          *      Pipe has emptied
 661          */      
 662         if (sk->send_tail == NULL || sk->send_head == NULL) 
 663         {
 664                 sk->send_head = NULL;
 665                 sk->send_tail = NULL;
 666                 sk->packets_out= 0;
 667         }
 668 
 669         /*
 670          *      We don't want too many packets out there. 
 671          */
 672          
 673         if (sk->ip_xmit_timeout == TIME_WRITE && 
 674                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
 675         {
 676                 
 677                 /* 
 678                  * This is Jacobson's slow start and congestion avoidance. 
 679                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
 680                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
 681                  * counter and increment it once every cwnd times.  It's possible
 682                  * that this should be done only if sk->retransmits == 0.  I'm
 683                  * interpreting "new data is acked" as including data that has
 684                  * been retransmitted but is just now being acked.
 685                  */
 686                 if (sk->cong_window < sk->ssthresh)  
 687                         /* 
 688                          *      In "safe" area, increase
 689                          */
 690                         sk->cong_window++;
 691                 else 
 692                 {
 693                         /*
 694                          *      In dangerous area, increase slowly.  In theory this is
 695                          *      sk->cong_window += 1 / sk->cong_window
 696                          */
 697                         if (sk->cong_count >= sk->cong_window) 
 698                         {
 699                                 sk->cong_window++;
 700                                 sk->cong_count = 0;
 701                         }
 702                         else 
 703                                 sk->cong_count++;
 704                 }
 705         }
 706 
 707         /*
 708          *      Remember the highest ack received.
 709          */
 710          
 711         sk->rcv_ack_seq = ack;
 712         
 713         /*
 714          *      We passed data and got it acked, remove any soft error
 715          *      log. Something worked...
 716          */
 717          
 718         sk->err_soft = 0;
 719 
 720         /*
 721          *      If this ack opens up a zero window, clear backoff.  It was
 722          *      being used to time the probes, and is probably far higher than
 723          *      it needs to be for normal retransmission.
 724          */
 725 
 726         if (sk->ip_xmit_timeout == TIME_PROBE0) 
 727         {
 728                 sk->retransmits = 0;    /* Our probe was answered */
 729                 
 730                 /*
 731                  *      Was it a usable window open ?
 732                  */
 733                  
 734                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
 735                     ! before (sk->window_seq, sk->write_queue.next->end_seq)) 
 736                 {
 737                         sk->backoff = 0;
 738                         
 739                         /*
 740                          *      Recompute rto from rtt.  this eliminates any backoff.
 741                          */
 742 
 743                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
 744                         if (sk->rto > 120*HZ)
 745                                 sk->rto = 120*HZ;
 746                         if (sk->rto < HZ/5)     /* Was 1*HZ, then 1 - turns out we must allow about
 747                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
 748                                                    .2 of a second is going to need huge windows (SIGH) */
 749                         sk->rto = HZ/5;
 750                 }
 751         }
 752 
 753         /* 
 754          *      See if we can take anything off of the retransmit queue.
 755          */
 756    
 757         while(sk->send_head != NULL) 
 758         {
 759                 /* Check for a bug. */
 760                 if (sk->send_head->link3 &&
 761                     after(sk->send_head->end_seq, sk->send_head->link3->end_seq)) 
 762                         printk("INET: tcp.c: *** bug send_list out of order.\n");
 763                         
 764                 /*
 765                  *      If our packet is before the ack sequence we can
 766                  *      discard it as it's confirmed to have arrived the other end.
 767                  */
 768                  
 769                 if (before(sk->send_head->end_seq, ack+1)) 
 770                 {
 771                         struct sk_buff *oskb;   
 772                         if (sk->retransmits) 
 773                         {       
 774                                 /*
 775                                  *      We were retransmitting.  don't count this in RTT est 
 776                                  */
 777                                 flag |= 2;
 778 
 779                                 /*
 780                                  * even though we've gotten an ack, we're still
 781                                  * retransmitting as long as we're sending from
 782                                  * the retransmit queue.  Keeping retransmits non-zero
 783                                  * prevents us from getting new data interspersed with
 784                                  * retransmissions.
 785                                  */
 786 
 787                                 if (sk->send_head->link3)       /* Any more queued retransmits? */
 788                                         sk->retransmits = 1;
 789                                 else
 790                                         sk->retransmits = 0;
 791                         }
 792                         /*
 793                          * Note that we only reset backoff and rto in the
 794                          * rtt recomputation code.  And that doesn't happen
 795                          * if there were retransmissions in effect.  So the
 796                          * first new packet after the retransmissions is
 797                          * sent with the backoff still in effect.  Not until
 798                          * we get an ack from a non-retransmitted packet do
 799                          * we reset the backoff and rto.  This allows us to deal
 800                          * with a situation where the network delay has increased
 801                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
 802                          */
 803 
 804                         /*
 805                          *      We have one less packet out there. 
 806                          */
 807                          
 808                         if (sk->packets_out > 0) 
 809                                 sk->packets_out --;
 810 
 811                         oskb = sk->send_head;
 812 
 813                         if (!(flag&2))  /* Not retransmitting */
 814                                 tcp_rtt_estimator(sk,oskb);
 815                         flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
 816                                            In this case as we just set it up */
 817                         cli();
 818                         oskb = sk->send_head;
 819                         IS_SKB(oskb);
 820                         sk->send_head = oskb->link3;
 821                         if (sk->send_head == NULL) 
 822                         {
 823                                 sk->send_tail = NULL;
 824                         }
 825 
 826                 /*
 827                  *      We may need to remove this from the dev send list. 
 828                  */
 829 
 830                         if (oskb->next)
 831                                 skb_unlink(oskb);
 832                         sti();
 833                         kfree_skb(oskb, FREE_WRITE); /* write. */
 834                         if (!sk->dead)
 835                                 sk->write_space(sk);
 836                 }
 837                 else
 838                 {
 839                         break;
 840                 }
 841         }
 842 
 843         /*
 844          * XXX someone ought to look at this too.. at the moment, if skb_peek()
 845          * returns non-NULL, we completely ignore the timer stuff in the else
 846          * clause.  We ought to organize the code so that else clause can
 847          * (should) be executed regardless, possibly moving the PROBE timer
 848          * reset over.  The skb_peek() thing should only move stuff to the
 849          * write queue, NOT also manage the timer functions.
 850          */
 851 
 852         /*
 853          * Maybe we can take some stuff off of the write queue,
 854          * and put it onto the xmit queue.
 855          */
 856         if (skb_peek(&sk->write_queue) != NULL) 
 857         {
 858                 if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
 859                         (sk->retransmits == 0 || 
 860                          sk->ip_xmit_timeout != TIME_WRITE ||
 861                          before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
 862                         && sk->packets_out < sk->cong_window) 
 863                 {
 864                         /*
 865                          *      Add more data to the send queue.
 866                          */
 867                         flag |= 1;
 868                         tcp_write_xmit(sk);
 869                 }
 870                 else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
 871                         sk->send_head == NULL &&
 872                         sk->ack_backlog == 0 &&
 873                         sk->state != TCP_TIME_WAIT) 
 874                 {
 875                         /*
 876                          *      Data to queue but no room.
 877                          */
 878                         tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
 879                 }               
 880         }
 881         else
 882         {
 883                 /*
 884                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
 885                  * from TCP_CLOSE we don't do anything
 886                  *
 887                  * from anything else, if there is write data (or fin) pending,
 888                  * we use a TIME_WRITE timeout, else if keepalive we reset to
 889                  * a KEEPALIVE timeout, else we delete the timer.
 890                  *
 891                  * We do not set flag for nominal write data, otherwise we may
 892                  * force a state where we start to write itsy bitsy tidbits
 893                  * of data.
 894                  */
 895 
 896                 switch(sk->state) {
 897                 case TCP_TIME_WAIT:
 898                         /*
 899                          * keep us in TIME_WAIT until we stop getting packets,
 900                          * reset the timeout.
 901                          */
 902                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 903                         break;
 904                 case TCP_CLOSE:
 905                         /*
 906                          * don't touch the timer.
 907                          */
 908                         break;
 909                 default:
 910                         /*
 911                          *      Must check send_head, write_queue, and ack_backlog
 912                          *      to determine which timeout to use.
 913                          */
 914                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
 915                                 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
 916                         } else if (sk->keepopen) {
 917                                 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
 918                         } else {
 919                                 del_timer(&sk->retransmit_timer);
 920                                 sk->ip_xmit_timeout = 0;
 921                         }
 922                         break;
 923                 }
 924         }
 925 
 926         /*
 927          *      We have nothing queued but space to send. Send any partial
 928          *      packets immediately (end of Nagle rule application).
 929          */
 930          
 931         if (sk->packets_out == 0 && sk->partial != NULL &&
 932                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
 933         {
 934                 flag |= 1;
 935                 tcp_send_partial(sk);
 936         }
 937 
 938         /*
 939          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
 940          * we are now waiting for an acknowledge to our FIN.  The other end is
 941          * already in TIME_WAIT.
 942          *
 943          * Move to TCP_CLOSE on success.
 944          */
 945 
 946         if (sk->state == TCP_LAST_ACK) 
 947         {
 948                 if (!sk->dead)
 949                         sk->state_change(sk);
 950                 if(sk->debug)
 951                         printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
 952                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
 953                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
 954                 {
 955                         flag |= 1;
 956                         sk->shutdown = SHUTDOWN_MASK;
 957                         tcp_set_state(sk,TCP_CLOSE);
 958                         return 1;
 959                 }
 960         }
 961 
 962         /*
 963          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
 964          *
 965          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
 966          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
 967          */
 968 
 969         if (sk->state == TCP_FIN_WAIT1) 
 970         {
 971 
 972                 if (!sk->dead) 
 973                         sk->state_change(sk);
 974                 if (sk->rcv_ack_seq == sk->write_seq) 
 975                 {
 976                         flag |= 1;
 977                         sk->shutdown |= SEND_SHUTDOWN;
 978                         tcp_set_state(sk, TCP_FIN_WAIT2);
 979                 }
 980         }
 981 
 982         /*
 983          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
 984          *
 985          *      Move to TIME_WAIT
 986          */
 987 
 988         if (sk->state == TCP_CLOSING) 
 989         {
 990 
 991                 if (!sk->dead) 
 992                         sk->state_change(sk);
 993                 if (sk->rcv_ack_seq == sk->write_seq) 
 994                 {
 995                         flag |= 1;
 996                         tcp_time_wait(sk);
 997                 }
 998         }
 999         
1000         /*
1001          *      Final ack of a three way shake 
1002          */
1003          
1004         if(sk->state==TCP_SYN_RECV)
1005         {
1006                 tcp_set_state(sk, TCP_ESTABLISHED);
1007                 tcp_options(sk,th);
1008                 sk->dummy_th.dest=th->source;
1009                 sk->copied_seq = sk->acked_seq;
1010                 if(!sk->dead)
1011                         sk->state_change(sk);
1012                 if(sk->max_window==0)
1013                 {
1014                         sk->max_window=32;      /* Sanity check */
1015                         sk->mss=min(sk->max_window,sk->mtu);
1016                 }
1017         }
1018         
1019         /*
1020          * I make no guarantees about the first clause in the following
1021          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
1022          * what conditions "!flag" would be true.  However I think the rest
1023          * of the conditions would prevent that from causing any
1024          * unnecessary retransmission. 
1025          *   Clearly if the first packet has expired it should be 
1026          * retransmitted.  The other alternative, "flag&2 && retransmits", is
1027          * harder to explain:  You have to look carefully at how and when the
1028          * timer is set and with what timeout.  The most recent transmission always
1029          * sets the timer.  So in general if the most recent thing has timed
1030          * out, everything before it has as well.  So we want to go ahead and
1031          * retransmit some more.  If we didn't explicitly test for this
1032          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
1033          * would not be true.  If you look at the pattern of timing, you can
1034          * show that rto is increased fast enough that the next packet would
1035          * almost never be retransmitted immediately.  Then you'd end up
1036          * waiting for a timeout to send each packet on the retransmission
1037          * queue.  With my implementation of the Karn sampling algorithm,
1038          * the timeout would double each time.  The net result is that it would
1039          * take a hideous amount of time to recover from a single dropped packet.
1040          * It's possible that there should also be a test for TIME_WRITE, but
1041          * I think as long as "send_head != NULL" and "retransmit" is on, we've
1042          * got to be in real retransmission mode.
1043          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
1044          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
1045          * As long as no further losses occur, this seems reasonable.
1046          */
1047         
1048         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
1049                (((flag&2) && sk->retransmits) ||
1050                (sk->send_head->when + sk->rto < jiffies))) 
1051         {
1052                 if(sk->send_head->when + sk->rto < jiffies)
1053                         tcp_retransmit(sk,0);   
1054                 else
1055                 {
1056                         tcp_do_retransmit(sk, 1);
1057                         tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1058                 }
1059         }
1060 
1061         return 1;
1062 
1063 uninteresting_ack:
1064         if(sk->debug)
1065                 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
1066                         
1067         /*
1068          *      Keepalive processing.
1069          */
1070                  
1071         if (after(ack, sk->sent_seq)) 
1072         {
1073                 return 0;
1074         }
1075                 
1076         /*
1077          *      Restart the keepalive timer.
1078          */
1079                  
1080         if (sk->keepopen) 
1081         {
1082                 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
1083                         tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1084         }
1085         return 1;
1086 }
1087 
1088 
1089 /*
1090  *      Process the FIN bit. This now behaves as it is supposed to work
1091  *      and the FIN takes effect when it is validly part of sequence
1092  *      space. Not before when we get holes.
1093  *
1094  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1095  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
1096  *      TIME-WAIT)
1097  *
1098  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
1099  *      close and we go into CLOSING (and later onto TIME-WAIT)
1100  *
1101  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1102  *
1103  */
1104  
1105 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     /*  */
1106 {
1107         sk->fin_seq = skb->end_seq;
1108 
1109         if (!sk->dead) 
1110         {
1111                 sk->state_change(sk);
1112                 sock_wake_async(sk->socket, 1);
1113         }
1114 
1115         switch(sk->state) 
1116         {
1117                 case TCP_SYN_RECV:
1118                 case TCP_SYN_SENT:
1119                 case TCP_ESTABLISHED:
1120                         /*
1121                          * move to CLOSE_WAIT, tcp_data() already handled
1122                          * sending the ack.
1123                          */
1124                         tcp_set_state(sk,TCP_CLOSE_WAIT);
1125                         if (th->rst)
1126                                 sk->shutdown = SHUTDOWN_MASK;
1127                         break;
1128 
1129                 case TCP_CLOSE_WAIT:
1130                 case TCP_CLOSING:
1131                         /*
1132                          * received a retransmission of the FIN, do
1133                          * nothing.
1134                          */
1135                         break;
1136                 case TCP_TIME_WAIT:
1137                         /*
1138                          * received a retransmission of the FIN,
1139                          * restart the TIME_WAIT timer.
1140                          */
1141                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1142                         return(0);
1143                 case TCP_FIN_WAIT1:
1144                         /*
1145                          * This case occurs when a simultaneous close
1146                          * happens, we must ack the received FIN and
1147                          * enter the CLOSING state.
1148                          *
1149                          * This causes a WRITE timeout, which will either
1150                          * move on to TIME_WAIT when we timeout, or resend
1151                          * the FIN properly (maybe we get rid of that annoying
1152                          * FIN lost hang). The TIME_WRITE code is already correct
1153                          * for handling this timeout.
1154                          */
1155 
1156                         if(sk->ip_xmit_timeout != TIME_WRITE)
1157                                 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1158                         tcp_set_state(sk,TCP_CLOSING);
1159                         break;
1160                 case TCP_FIN_WAIT2:
1161                         /*
1162                          * received a FIN -- send ACK and enter TIME_WAIT
1163                          */
1164                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1165                         sk->shutdown|=SHUTDOWN_MASK;
1166                         tcp_set_state(sk,TCP_TIME_WAIT);
1167                         break;
1168                 case TCP_CLOSE:
1169                         /*
1170                          * already in CLOSE
1171                          */
1172                         break;
1173                 default:
1174                         tcp_set_state(sk,TCP_LAST_ACK);
1175         
1176                         /* Start the timers. */
1177                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1178                         return(0);
1179         }
1180 
1181         return(0);
1182 }
1183 
1184 
1185 
1186 /*
1187  *      This routine handles the data.  If there is room in the buffer,
1188  *      it will be have already been moved into it.  If there is no
1189  *      room, then we will just have to discard the packet.
1190  */
1191 
1192 static int tcp_data(struct sk_buff *skb, struct sock *sk, 
     /*  */
1193          unsigned long saddr, unsigned short len)
1194 {
1195         struct sk_buff *skb1, *skb2;
1196         struct tcphdr *th;
1197         int dup_dumped=0;
1198         u32 new_seq, shut_seq;
1199 
1200         th = skb->h.th;
1201         skb_pull(skb,th->doff*4);
1202         skb_trim(skb,len-(th->doff*4));
1203 
1204         /*
1205          *      The bytes in the receive read/assembly queue has increased. Needed for the
1206          *      low memory discard algorithm 
1207          */
1208            
1209         sk->bytes_rcv += skb->len;
1210         
1211         if (skb->len == 0 && !th->fin) 
1212         {
1213                 /* 
1214                  *      Don't want to keep passing ack's back and forth. 
1215                  *      (someone sent us dataless, boring frame)
1216                  */
1217                 if (!th->ack)
1218                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
1219                 kfree_skb(skb, FREE_READ);
1220                 return(0);
1221         }
1222         
1223         /*
1224          *      We no longer have anyone receiving data on this connection.
1225          */
1226 
1227 #ifndef TCP_DONT_RST_SHUTDOWN            
1228 
1229         if(sk->shutdown & RCV_SHUTDOWN)
1230         {
1231                 /*
1232                  *      FIXME: BSD has some magic to avoid sending resets to
1233                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
1234                  *      BSD stacks still have broken keepalives so we want to
1235                  *      cope with it.
1236                  */
1237 
1238                 if(skb->len)    /* We don't care if it's just an ack or
1239                                    a keepalive/window probe */
1240                 {
1241                         new_seq = skb->seq + skb->len + th->syn;        /* Right edge of _data_ part of frame */
1242                         
1243                         /* Do this the way 4.4BSD treats it. Not what I'd
1244                            regard as the meaning of the spec but it's what BSD
1245                            does and clearly they know everything 8) */
1246 
1247                         /*
1248                          *      This is valid because of two things
1249                          *
1250                          *      a) The way tcp_data behaves at the bottom.
1251                          *      b) A fin takes effect when read not when received.
1252                          */
1253                          
1254                         shut_seq = sk->acked_seq+1;     /* Last byte */
1255                         
1256                         if(after(new_seq,shut_seq))
1257                         {
1258                                 if(sk->debug)
1259                                         printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
1260                                                 sk, new_seq, shut_seq, sk->blog);
1261                                 if(sk->dead)
1262                                 {
1263                                         sk->acked_seq = new_seq + th->fin;
1264                                         tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
1265                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
1266                                         tcp_statistics.TcpEstabResets++;
1267                                         sk->err = EPIPE;
1268                                         sk->error_report(sk);
1269                                         sk->shutdown = SHUTDOWN_MASK;
1270                                         tcp_set_state(sk,TCP_CLOSE);
1271                                         kfree_skb(skb, FREE_READ);
1272                                         return 0;
1273                                 }
1274                         }
1275                 }
1276         }
1277 
1278 #endif
1279 
1280         /*
1281          *      Now we have to walk the chain, and figure out where this one
1282          *      goes into it.  This is set up so that the last packet we received
1283          *      will be the first one we look at, that way if everything comes
1284          *      in order, there will be no performance loss, and if they come
1285          *      out of order we will be able to fit things in nicely.
1286          *
1287          *      [AC: This is wrong. We should assume in order first and then walk
1288          *       forwards from the first hole based upon real traffic patterns.]
1289          *      
1290          */
1291 
1292         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
1293         {
1294                 skb_queue_head(&sk->receive_queue,skb);
1295                 skb1= NULL;
1296         } 
1297         else
1298         {
1299                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
1300                 {
1301                         if(sk->debug)
1302                         {
1303                                 printk("skb1=%p :", skb1);
1304                                 printk("skb1->seq = %d: ", skb1->seq);
1305                                 printk("skb->seq = %d\n",skb->seq);
1306                                 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
1307                                                 sk->acked_seq);
1308                         }
1309                         
1310                         /*
1311                          *      Optimisation: Duplicate frame or extension of previous frame from
1312                          *      same sequence point (lost ack case).
1313                          *      The frame contains duplicate data or replaces a previous frame
1314                          *      discard the previous frame (safe as sk->users is set) and put
1315                          *      the new one in its place.
1316                          */
1317                          
1318                         if (skb->seq==skb1->seq && skb->len>=skb1->len)
1319                         {
1320                                 skb_append(skb1,skb);
1321                                 skb_unlink(skb1);
1322                                 kfree_skb(skb1,FREE_READ);
1323                                 dup_dumped=1;
1324                                 skb1=NULL;
1325                                 break;
1326                         }
1327                         
1328                         /*
1329                          *      Found where it fits
1330                          */
1331                          
1332                         if (after(skb->seq+1, skb1->seq))
1333                         {
1334                                 skb_append(skb1,skb);
1335                                 break;
1336                         }
1337                         
1338                         /*
1339                          *      See if we've hit the start. If so insert.
1340                          */
1341                         if (skb1 == skb_peek(&sk->receive_queue))
1342                         {
1343                                 skb_queue_head(&sk->receive_queue, skb);
1344                                 break;
1345                         }
1346                 }
1347         }
1348 
1349         /*
1350          *      Figure out what the ack value for this frame is
1351          */
1352          
1353         if (before(sk->acked_seq, sk->copied_seq)) 
1354         {
1355                 printk("*** tcp.c:tcp_data bug acked < copied\n");
1356                 sk->acked_seq = sk->copied_seq;
1357         }
1358 
1359         /*
1360          *      Now figure out if we can ack anything. This is very messy because we really want two
1361          *      receive queues, a completed and an assembly queue. We also want only one transmit
1362          *      queue.
1363          */
1364 
1365         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1)) 
1366         {
1367                 if (before(skb->seq, sk->acked_seq+1)) 
1368                 {
1369 
1370                         if (after(skb->end_seq, sk->acked_seq)) 
1371                                 sk->acked_seq = skb->end_seq;
1372 
1373                         skb->acked = 1;
1374 
1375                         /*
1376                          *      When we ack the fin, we do the FIN 
1377                          *      processing.
1378                          */
1379 
1380                         if (skb->h.th->fin) 
1381                         {
1382                                 tcp_fin(skb,sk,skb->h.th);
1383                         }
1384           
1385                         for(skb2 = skb->next;
1386                             skb2 != (struct sk_buff *)&sk->receive_queue;
1387                             skb2 = skb2->next) 
1388                         {
1389                                 if (before(skb2->seq, sk->acked_seq+1)) 
1390                                 {
1391                                         if (after(skb2->end_seq, sk->acked_seq))
1392                                                 sk->acked_seq = skb2->end_seq;
1393 
1394                                         skb2->acked = 1;
1395                                         /*
1396                                          *      When we ack the fin, we do
1397                                          *      the fin handling.
1398                                          */
1399                                         if (skb2->h.th->fin) 
1400                                         {
1401                                                 tcp_fin(skb,sk,skb->h.th);
1402                                         }
1403 
1404                                         /*
1405                                          *      Force an immediate ack.
1406                                          */
1407                                          
1408                                         sk->ack_backlog = sk->max_ack_backlog;
1409                                 }
1410                                 else
1411                                 {
1412                                         break;
1413                                 }
1414                         }
1415 
1416                         /*
1417                          *      This also takes care of updating the window.
1418                          *      This if statement needs to be simplified.
1419                          *
1420                          *      rules for delaying an ack:
1421                          *      - delay time <= 0.5 HZ
1422                          *      - we don't have a window update to send
1423                          *      - must send at least every 2 full sized packets
1424                          */
1425                         if (!sk->delay_acks ||
1426                             /* sk->ack_backlog >= sk->max_ack_backlog || */
1427                             sk->bytes_rcv > sk->max_unacked || th->fin ||
1428                             sk->ato > HZ/2 ||
1429                             tcp_raise_window(sk)) {
1430                                 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr);
1431                         }
1432                         else 
1433                         {       
1434                                 sk->ack_backlog++;
1435                         
1436                                 if(sk->debug)                           
1437                                         printk("Ack queued.\n");
1438                                 
1439                                 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->ato);
1440                                 
1441                         }
1442                 }
1443         }
1444 
1445         /*
1446          *      If we've missed a packet, send an ack.
1447          *      Also start a timer to send another.
1448          */
1449          
1450         if (!skb->acked) 
1451         {
1452         
1453         /*
1454          *      This is important.  If we don't have much room left,
1455          *      we need to throw out a few packets so we have a good
1456          *      window.  Note that mtu is used, not mss, because mss is really
1457          *      for the send side.  He could be sending us stuff as large as mtu.
1458          */
1459                  
1460                 while (sock_rspace(sk) < sk->mtu) 
1461                 {
1462                         skb1 = skb_peek(&sk->receive_queue);
1463                         if (skb1 == NULL) 
1464                         {
1465                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
1466                                 break;
1467                         }
1468 
1469                         /*
1470                          *      Don't throw out something that has been acked. 
1471                          */
1472                  
1473                         if (skb1->acked) 
1474                         {
1475                                 break;
1476                         }
1477                 
1478                         skb_unlink(skb1);
1479                         kfree_skb(skb1, FREE_READ);
1480                 }
1481                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
1482                 sk->ack_backlog++;
1483                 tcp_reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, HZ/2));
1484         }
1485 
1486         /*
1487          *      Now tell the user we may have some data. 
1488          */
1489          
1490         if (!sk->dead) 
1491         {
1492                 if(sk->debug)
1493                         printk("Data wakeup.\n");
1494                 sk->data_ready(sk,0);
1495         } 
1496         return(0);
1497 }
1498 
1499 
1500 /*
1501  *      This routine is only called when we have urgent data
1502  *      signalled. Its the 'slow' part of tcp_urg. It could be
1503  *      moved inline now as tcp_urg is only called from one
1504  *      place. We handle URGent data wrong. We have to - as
1505  *      BSD still doesn't use the correction from RFC961.
1506  */
1507  
1508 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /*  */
1509 {
1510         u32 ptr = ntohs(th->urg_ptr);
1511 
1512         if (ptr)
1513                 ptr--;
1514         ptr += ntohl(th->seq);
1515 
1516         /* ignore urgent data that we've already seen and read */
1517         if (after(sk->copied_seq, ptr))
1518                 return;
1519 
1520         /* do we already have a newer (or duplicate) urgent pointer? */
1521         if (sk->urg_data && !after(ptr, sk->urg_seq))
1522                 return;
1523 
1524         /* tell the world about our new urgent pointer */
1525         if (sk->proc != 0) {
1526                 if (sk->proc > 0) {
1527                         kill_proc(sk->proc, SIGURG, 1);
1528                 } else {
1529                         kill_pg(-sk->proc, SIGURG, 1);
1530                 }
1531         }
1532         sk->urg_data = URG_NOTYET;
1533         sk->urg_seq = ptr;
1534 }
1535 
1536 /*
1537  *      This is the 'fast' part of urgent handling.
1538  */
1539  
1540 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
     /*  */
1541 {
1542         /*
1543          *      Check if we get a new urgent pointer - normally not 
1544          */
1545          
1546         if (th->urg)
1547                 tcp_check_urg(sk,th);
1548 
1549         /*
1550          *      Do we wait for any urgent data? - normally not
1551          */
1552          
1553         if (sk->urg_data == URG_NOTYET) {
1554                 u32 ptr;
1555 
1556                 /*
1557                  *      Is the urgent pointer pointing into this packet? 
1558                  */      
1559                 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
1560                 if (ptr < len) {
1561                         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
1562                         if (!sk->dead)
1563                                 sk->data_ready(sk,0);
1564                 }
1565         }
1566 }
1567 
1568 
1569 /*
1570  *      A TCP packet has arrived.
1571  *              skb->h.raw is the TCP header.
1572  */
1573  
1574 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
     /*  */
1575         __u32 daddr, unsigned short len,
1576         __u32 saddr, int redo, struct inet_protocol * protocol)
1577 {
1578         struct tcphdr *th;
1579         struct sock *sk;
1580         int syn_ok=0;
1581 
1582         /*
1583          * "redo" is 1 if we have already seen this skb but couldn't
1584          * use it at that time (the socket was locked).  In that case
1585          * we have already done a lot of the work (looked up the socket
1586          * etc).
1587          */
1588         th = skb->h.th;
1589         sk = skb->sk;
1590         if (!redo) {
1591                 tcp_statistics.TcpInSegs++;
1592                 if (skb->pkt_type!=PACKET_HOST)
1593                         goto discard_it;
1594 
1595                 /*
1596                  *      Pull up the IP header.
1597                  */
1598         
1599                 skb_pull(skb, skb->h.raw-skb->data);
1600 
1601                 /*
1602                  *      Try to use the device checksum if provided.
1603                  */
1604                 switch (skb->ip_summed) 
1605                 {
1606                         case CHECKSUM_NONE:
1607                                 skb->csum = csum_partial((char *)th, len, 0);
1608                         case CHECKSUM_HW:
1609                                 if (tcp_check(th, len, saddr, daddr, skb->csum))
1610                                         goto discard_it;
1611                         default:
1612                                 /* CHECKSUM_UNNECESSARY */
1613                 }
1614                 sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
1615                 if (!sk)
1616                         goto no_tcp_socket;
1617                 skb->sk = sk;
1618                 skb->seq = ntohl(th->seq);
1619                 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
1620                 skb->ack_seq = ntohl(th->ack_seq);
1621 
1622                 skb->acked = 0;
1623                 skb->used = 0;
1624                 skb->free = 1;
1625                 skb->saddr = daddr;
1626                 skb->daddr = saddr;
1627 
1628                 /* We may need to add it to the backlog here. */
1629                 if (sk->users) 
1630                 {
1631                         skb_queue_tail(&sk->back_log, skb);
1632                         return(0);
1633                 }
1634         }
1635 
1636         /*
1637          *      If this socket has got a reset it's to all intents and purposes 
1638          *      really dead. Count closed sockets as dead.
1639          *
1640          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
1641          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
1642          *      exist so should cause resets as if the port was unreachable.
1643          */
1644 
1645         if (sk->zapped || sk->state==TCP_CLOSE)
1646                 goto no_tcp_socket;
1647 
1648         if (!sk->prot) 
1649         {
1650                 printk("IMPOSSIBLE 3\n");
1651                 return(0);
1652         }
1653 
1654 
1655         /*
1656          *      Charge the memory to the socket. 
1657          */
1658          
1659         skb->sk=sk;
1660         sk->rmem_alloc += skb->truesize;
1661         
1662         /*
1663          *      We should now do header prediction.
1664          */
1665          
1666         /*
1667          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
1668          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
1669          *      compatibility. We also set up variables more thoroughly [Karn notes in the
1670          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
1671          */
1672 
1673         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
1674         {
1675         
1676                 /*
1677                  *      Now deal with unusual cases.
1678                  */
1679          
1680                 if(sk->state==TCP_LISTEN)
1681                 {
1682                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
1683                                 tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
1684 
1685                         /*
1686                          *      We don't care for RST, and non SYN are absorbed (old segments)
1687                          *      Broadcast/multicast SYN isn't allowed. Note - bug if you change the
1688                          *      netmask on a running connection it can go broadcast. Even Sun's have
1689                          *      this problem so I'm ignoring it 
1690                          */
1691                            
1692                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
1693                         {
1694                                 kfree_skb(skb, FREE_READ);
1695                                 return 0;
1696                         }
1697                 
1698                         /*      
1699                          *      Guess we need to make a new socket up 
1700                          */
1701                 
1702                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
1703                 
1704                         /*
1705                          *      Now we have several options: In theory there is nothing else
1706                          *      in the frame. KA9Q has an option to send data with the syn,
1707                          *      BSD accepts data with the syn up to the [to be] advertised window
1708                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
1709                          *      it, that fits the spec precisely and avoids incompatibilities. It
1710                          *      would be nice in future to drop through and process the data.
1711                          *
1712                          *      Now TTCP is starting to use we ought to queue this data.
1713                          */
1714                          
1715                         return 0;
1716                 }
1717         
1718                 /* 
1719                  *      Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
1720                  *      then its a new connection
1721                  */
1722                  
1723                 if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
1724                 {
1725                         kfree_skb(skb, FREE_READ);
1726                         return 0;
1727                 }
1728                 
1729                 /*
1730                  *      SYN sent means we have to look for a suitable ack and either reset
1731                  *      for bad matches or go to connected. The SYN_SENT case is unusual and should
1732                  *      not be in line code. [AC]
1733                  */
1734            
1735                 if(sk->state==TCP_SYN_SENT)
1736                 {
1737                         /* Crossed SYN or previous junk segment */
1738                         if(th->ack)
1739                         {
1740                                 /* We got an ack, but it's not a good ack */
1741                                 if(!tcp_ack(sk,th,skb->ack_seq,len))
1742                                 {
1743                                         /* Reset the ack - its an ack from a 
1744                                            different connection  [ th->rst is checked in tcp_send_reset()] */
1745                                         tcp_statistics.TcpAttemptFails++;
1746                                         tcp_send_reset(daddr, saddr, th,
1747                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1748                                         kfree_skb(skb, FREE_READ);
1749                                         return(0);
1750                                 }
1751                                 if(th->rst)
1752                                         return tcp_reset(sk,skb);
1753                                 if(!th->syn)
1754                                 {
1755                                         /* A valid ack from a different connection
1756                                            start. Shouldn't happen but cover it */
1757                                         tcp_statistics.TcpAttemptFails++;
1758                                         tcp_send_reset(daddr, saddr, th,
1759                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1760                                         kfree_skb(skb, FREE_READ);
1761                                         return 0;
1762                                 }
1763                                 /*
1764                                  *      Ok.. it's good. Set up sequence numbers and
1765                                  *      move to established.
1766                                  */
1767                                 syn_ok=1;       /* Don't reset this connection for the syn */
1768                                 sk->acked_seq = skb->seq+1;
1769                                 sk->lastwin_seq = skb->seq+1;
1770                                 sk->fin_seq = skb->seq;
1771                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
1772                                 tcp_set_state(sk, TCP_ESTABLISHED);
1773                                 tcp_options(sk,th);
1774                                 sk->dummy_th.dest=th->source;
1775                                 sk->copied_seq = sk->acked_seq;
1776                                 if(!sk->dead)
1777                                 {
1778                                         sk->state_change(sk);
1779                                         sock_wake_async(sk->socket, 0);
1780                                 }
1781                                 if(sk->max_window==0)
1782                                 {
1783                                         sk->max_window = 32;
1784                                         sk->mss = min(sk->max_window, sk->mtu);
1785                                 }
1786                         }
1787                         else
1788                         {
1789                                 /* See if SYN's cross. Drop if boring */
1790                                 if(th->syn && !th->rst)
1791                                 {
1792                                         /* Crossed SYN's are fine - but talking to
1793                                            yourself is right out... */
1794                                         if(sk->saddr==saddr && sk->daddr==daddr &&
1795                                                 sk->dummy_th.source==th->source &&
1796                                                 sk->dummy_th.dest==th->dest)
1797                                         {
1798                                                 tcp_statistics.TcpAttemptFails++;
1799                                                 return tcp_reset(sk,skb);
1800                                         }
1801                                         tcp_set_state(sk,TCP_SYN_RECV);
1802                                         
1803                                         /*
1804                                          *      FIXME:
1805                                          *      Must send SYN|ACK here
1806                                          */
1807                                 }               
1808                                 /* Discard junk segment */
1809                                 kfree_skb(skb, FREE_READ);
1810                                 return 0;
1811                         }
1812                         /*
1813                          *      SYN_RECV with data maybe.. drop through
1814                          */
1815                         goto rfc_step6;
1816                 }
1817 
1818         /*
1819          *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
1820          *      a more complex suggestion for fixing these reuse issues in RFC1644
1821          *      but not yet ready for general use. Also see RFC1379.
1822          *
1823          *      Note the funny way we go back to the top of this function for
1824          *      this case ("goto try_next_socket").  That also takes care of
1825          *      checking "sk->users" for the new socket as well as doing all
1826          *      the normal tests on the packet.
1827          */
1828         
1829 #define BSD_TIME_WAIT
1830 #ifdef BSD_TIME_WAIT
1831                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
1832                         after(skb->seq, sk->acked_seq) && !th->rst)
1833                 {
1834                         u32 seq = sk->write_seq;
1835                         if(sk->debug)
1836                                 printk("Doing a BSD time wait\n");
1837                         tcp_statistics.TcpEstabResets++;           
1838                         sk->rmem_alloc -= skb->truesize;
1839                         skb->sk = NULL;
1840                         sk->err=ECONNRESET;
1841                         tcp_set_state(sk, TCP_CLOSE);
1842                         sk->shutdown = SHUTDOWN_MASK;
1843                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
1844                         /* this is not really correct: we should check sk->users */
1845                         if (sk && sk->state==TCP_LISTEN)
1846                         {
1847                                 skb->sk = sk;
1848                                 sk->rmem_alloc += skb->truesize;
1849                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
1850                                 return 0;
1851                         }
1852                         kfree_skb(skb, FREE_READ);
1853                         return 0;
1854                 }
1855 #endif  
1856         }
1857 
1858         /*
1859          *      We are now in normal data flow (see the step list in the RFC)
1860          *      Note most of these are inline now. I'll inline the lot when
1861          *      I have time to test it hard and look at what gcc outputs 
1862          */
1863         
1864         if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
1865         {
1866                 bad_tcp_sequence(sk, th, len, opt, saddr, dev);
1867                 kfree_skb(skb, FREE_READ);
1868                 return 0;
1869         }
1870 
1871         if(th->rst)
1872                 return tcp_reset(sk,skb);
1873         
1874         /*
1875          *      !syn_ok is effectively the state test in RFC793.
1876          */
1877          
1878         if(th->syn && !syn_ok)
1879         {
1880                 tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
1881                 return tcp_reset(sk,skb);       
1882         }
1883 
1884         tcp_delack_estimator(sk);
1885         
1886         /*
1887          *      Process the ACK
1888          */
1889          
1890 
1891         if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
1892         {
1893                 /*
1894                  *      Our three way handshake failed.
1895                  */
1896                  
1897                 if(sk->state==TCP_SYN_RECV)
1898                 {
1899                         tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
1900                 }
1901                 kfree_skb(skb, FREE_READ);
1902                 return 0;
1903         }
1904         
1905 rfc_step6:              /* I'll clean this up later */
1906 
1907         /*
1908          *      If the accepted buffer put us over our queue size we
1909          *      now drop it (we must process the ack first to avoid
1910          *      deadlock cases).
1911          */
1912          
1913         if (sk->rmem_alloc  >= sk->rcvbuf) 
1914         {
1915                 kfree_skb(skb, FREE_READ);
1916                 return(0);
1917         }
1918 
1919 
1920         /*
1921          *      Process urgent data
1922          */
1923                 
1924         tcp_urg(sk, th, len);
1925         
1926         /*
1927          *      Process the encapsulated data
1928          */
1929         
1930         if(tcp_data(skb,sk, saddr, len))
1931                 kfree_skb(skb, FREE_READ);
1932 
1933         /*
1934          *      And done
1935          */     
1936         
1937         return 0;
1938 
1939 no_tcp_socket:
1940         /*
1941          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
1942          */
1943         tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
1944 
1945 discard_it:
1946         /*
1947          *      Discard frame
1948          */
1949         skb->sk = NULL;
1950         kfree_skb(skb, FREE_READ);
1951         return 0;
1952 }
/* */
root/net/ipv4/tcp_input.c

DEFINITIONS