1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: @(#)tcp_input.c 1.0.16 05/25/93
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * FIXES
23 * Pedro Roque : Double ACK bug
24 */
25
26 #include <linux/config.h>
27 #include <net/tcp.h>
28
29 /*
30 * Policy code extracted so it's now separate
31 */
32
33 /*
34 * Called each time to estimate the delayed ack timeout. This is
35 * how it should be done so a fast link isn't impacted by ack delay.
36 */
37
38 extern __inline__ void tcp_delack_estimator(struct sock *sk)
39 {
40 /*
41 * Delayed ACK time estimator.
42 */
43
44 if (sk->lrcvtime == 0)
45 {
46 sk->lrcvtime = jiffies;
47 sk->ato = HZ/3;
48 }
49 else
50 {
51 int m;
52
53 m = jiffies - sk->lrcvtime;
54
55 sk->lrcvtime = jiffies;
56
57 if (m <= 0)
58 m = 1;
59
60 if (m > (sk->rtt >> 3))
61 {
62 sk->ato = sk->rtt >> 3;
63 /*
64 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
65 */
66 }
67 else
68 {
69 sk->ato = (sk->ato >> 1) + m;
70 /*
71 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
72 */
73 }
74 }
75 }
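
/*
 * Editor's sketch: the update above written with unscaled values
 * (sk->rtt holds the smoothed RTT scaled by 8, so sk->rtt >> 3 is the
 * plain estimate).  The effect is that ato decays toward the recent
 * packet inter-arrival gap but never exceeds the smoothed RTT, so a
 * fast link ends up with a short delayed-ACK timeout.  The name
 * delack_sketch is illustrative only, not kernel API.
 */
static unsigned long delack_sketch(unsigned long ato, long gap, unsigned long srtt)
{
	if (gap <= 0)
		gap = 1;
	if (gap > (long) srtt)
		return srtt;			/* never delay longer than the smoothed RTT */
	return (ato >> 1) + gap;		/* otherwise blend the new gap into ato */
}
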
76
77 /*
78 * Called on frames that were known _not_ to have been
79 * retransmitted [see Karn/Partridge Proceedings SIGCOMM 87].
80 * The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
81 */
82
83 extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
84 {
85 long m;
86 /*
87 * The following amusing code comes from Jacobson's
88 * article in SIGCOMM '88. Note that rtt and mdev
89 * are scaled versions of rtt and mean deviation.
90 * This is designed to be as fast as possible
91 * m stands for "measurement".
92 */
93
94 m = jiffies - oskb->when; /* RTT */
95 if (sk->rtt != 0) {
96 if(m<=0)
97 m=1; /* IS THIS RIGHT FOR <0 ??? */
98 m -= (sk->rtt >> 3); /* m is now error in rtt est */
99 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
100 if (m < 0)
101 m = -m; /* m is now abs(error) */
102 m -= (sk->mdev >> 2); /* similar update on mdev */
103 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
104 } else {
105 /* no previous measure. */
106 sk->rtt = m<<3; /* take the measured time to be rtt */
107 sk->mdev = m<<2; /* make sure rto = 3*rtt */
108 }
109
110 /*
111 * Now update timeout. Note that this removes any backoff.
112 */
113
114 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
115 if (sk->rto > 120*HZ)
116 sk->rto = 120*HZ;
117 if (sk->rto < HZ/5) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
118 sk->rto = HZ/5;
119 sk->backoff = 0;
120 }
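
/*
 * Editor's sketch: the shift arithmetic above with the scaling taken
 * out.  sk->rtt holds 8*srtt and sk->mdev holds 4*mdev, so (modulo
 * rounding of the scaled integer arithmetic) the updates are the
 * exponential averages below, and the timeout comes out as
 * srtt + 2*mdev, clamped to [HZ/5, 120*HZ].  Names are illustrative
 * only, not kernel API.
 */
struct rtt_sketch {
	long srtt;	/* smoothed round trip time (jiffies) */
	long mdev;	/* smoothed mean deviation (jiffies) */
	long rto;	/* retransmission timeout (jiffies) */
};

static void rtt_sketch_update(struct rtt_sketch *r, long m, long hz)
{
	if (m <= 0)
		m = 1;
	if (r->srtt == 0) {
		r->srtt = m;			/* first measurement seeds both */
		r->mdev = m;			/* gives rto = 3*m initially */
	} else {
		long err = m - r->srtt;
		r->srtt += err / 8;		/* srtt = 7/8 srtt + 1/8 m */
		if (err < 0)
			err = -err;
		r->mdev += (err - r->mdev) / 4;	/* mdev = 3/4 mdev + 1/4 |err| */
	}
	r->rto = r->srtt + 2 * r->mdev;		/* note: this code uses 2*mdev, not 4 */
	if (r->rto > 120 * hz)
		r->rto = 120 * hz;
	if (r->rto < hz / 5)
		r->rto = hz / 5;		/* .2s floor for BSD delayed acks */
}
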
121
122 /*
123 * Cached last hit socket
124 */
125
126 static volatile unsigned long th_cache_saddr, th_cache_daddr;
127 static volatile unsigned short th_cache_dport, th_cache_sport;
128 static volatile struct sock *th_cache_sk;
129
130 void tcp_cache_zap(void)
131 {
132 th_cache_sk=NULL;
133 }
134
135 /*
136 * Find the socket, using the last hit cache if applicable. The cache is not quite
137 * right...
138 */
139
140 static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
141 {
142 struct sock * sk;
143
144 sk = (struct sock *) th_cache_sk;
145 if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
146 sport != th_cache_sport || dport != th_cache_dport) {
147 sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
148 if (sk) {
149 th_cache_saddr=saddr;
150 th_cache_daddr=daddr;
151 th_cache_dport=dport;
152 th_cache_sport=sport;
153 th_cache_sk=sk;
154 }
155 }
156 return sk;
157 }
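
/*
 * Editor's sketch of the one-entry demux cache pattern used above:
 * remember the last (addresses/ports -> socket) pair and fall back to
 * the full lookup on a miss.  The names and the callback are
 * illustrative only; the real code uses the globals and get_sock()
 * directly.
 */
struct demux_cache {
	u32 saddr, daddr;
	u16 sport, dport;
	struct sock *sk;
};

static struct sock *cached_lookup(struct demux_cache *c,
	u32 saddr, u16 sport, u32 daddr, u16 dport,
	struct sock *(*slow_lookup)(u32, u16, u32, u16))
{
	if (c->sk && c->saddr == saddr && c->daddr == daddr &&
	    c->sport == sport && c->dport == dport)
		return c->sk;					/* hit: reuse last result */
	c->sk = slow_lookup(saddr, sport, daddr, dport);	/* miss: full lookup */
	if (c->sk) {
		c->saddr = saddr; c->daddr = daddr;
		c->sport = sport; c->dport = dport;
	}
	return c->sk;
}
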
158
159 /*
160 * React to an out-of-window TCP sequence number in an incoming packet
161 */
162
163 static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, u32 end_seq,
164 struct device *dev)
165 {
166 if (th->rst)
167 return;
168
169 /*
170 * Send a reset if we get something not ours and we are
171 * unsynchronized. Note: We don't do anything to our end. We
172 * are just killing the bogus remote connection then we will
173 * connect again and it will work (with luck).
174 */
175
176 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
177 {
178 tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
179 return;
180 }
181
182 /*
183 * 4.3reno machines look for this kind of ack so they can do fast
184 * recovery. Three identical 'old' acks let it know that one frame has
185 * been lost and should be resent. Because this happens before the whole window
186 * of data has timed out, it can take one lost frame per window without
187 * stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
188 */
189 tcp_send_ack(sk);
190 }
191
192 /*
193 * This function checks whether the tcp header is actually acceptable.
194 */
195
196 extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
197 {
198 u32 end_window = sk->acked_seq + sk->window;
199 return /* if start is at end of window, end must be too (zero window) */
200 (seq == end_window && seq == end_seq) ||
201 /* if start is before end of window, check for interest */
202 (before(seq, end_window) && !before(end_seq, sk->acked_seq));
203 }
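
/*
 * Editor's sketch: the test above leans on the before()/after()
 * helpers, which compare 32-bit sequence numbers modulo 2^32 by
 * looking at the sign of their difference, so the window check keeps
 * working across sequence wrap-around.  A standalone version of both
 * pieces (illustrative names, assuming the usual signed-difference
 * trick):
 */
static int seq_before(u32 a, u32 b)		/* a < b in sequence space */
{
	return (s32) (a - b) < 0;
}

static int seq_acceptable(u32 seq, u32 end_seq, u32 acked_seq, u32 window)
{
	u32 end_window = acked_seq + window;
	/* zero window: only a bare segment sitting exactly on the edge fits */
	if (seq == end_window && seq == end_seq)
		return 1;
	/* otherwise it must start before the right edge and end at or
	   beyond what we have already acked */
	return seq_before(seq, end_window) && !seq_before(end_seq, acked_seq);
}
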
204
205 /*
206 * When we get a reset we do this. This probably is a tcp_output routine
207 * really.
208 */
209
210 static int tcp_reset(struct sock *sk, struct sk_buff *skb)
211 {
212 sk->zapped = 1;
213 /*
214 * We want the right error as BSD sees it (and indeed as we do).
215 */
216 sk->err = ECONNRESET;
217 if (sk->state == TCP_SYN_SENT)
218 sk->err = ECONNREFUSED;
219 if (sk->state == TCP_CLOSE_WAIT)
220 sk->err = EPIPE;
221 #ifdef CONFIG_TCP_RFC1337
222 /*
223 * Time wait assassination protection [RFC1337]
224 *
225 * This is a good idea, but causes more sockets to take time to close.
226 *
227 * Ian Heavens has since shown this is an inadequate fix for the protocol
228 * bug in question.
229 */
230 if(sk->state!=TCP_TIME_WAIT)
231 {
232 tcp_set_state(sk,TCP_CLOSE);
233 sk->shutdown = SHUTDOWN_MASK;
234 }
235 #else
236 tcp_set_state(sk,TCP_CLOSE);
237 sk->shutdown = SHUTDOWN_MASK;
238 #endif
239 if (!sk->dead)
240 sk->state_change(sk);
241 kfree_skb(skb, FREE_READ);
242 return(0);
243 }
244
245
246 /*
247 * Look for tcp options. Parses everything but only knows about MSS.
248 * This routine is always called with the packet containing the SYN.
249 * However it may also be called with the ack to the SYN. So you
250 * can't assume this is always the SYN. It's always called after
251 * we have set up sk->mtu to our own MTU.
252 *
253 * We need at minimum to add PAWS support here. Possibly large windows
254 * as Linux gets deployed on 100Mb/sec networks.
255 */
256
257 static void tcp_options(struct sock *sk, struct tcphdr *th)
258 {
259 unsigned char *ptr;
260 int length=(th->doff*4)-sizeof(struct tcphdr);
261 int mss_seen = 0;
262
263 ptr = (unsigned char *)(th + 1);
264
265 while(length>0)
266 {
267 int opcode=*ptr++;
268 int opsize=*ptr++;
269 switch(opcode)
270 {
271 case TCPOPT_EOL:
272 return;
273 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
274 length--;
275 ptr--; /* the opsize=*ptr++ above was a mistake */
276 continue;
277
278 default:
279 if(opsize<=2) /* Avoid silly options looping forever */
280 return;
281 switch(opcode)
282 {
283 case TCPOPT_MSS:
284 if(opsize==4 && th->syn)
285 {
286 sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
287 mss_seen = 1;
288 }
289 break;
290 /* Add other options here as people feel the urge to implement stuff like large windows */
291 }
292 ptr+=opsize-2;
293 length-=opsize;
294 }
295 }
296 if (th->syn)
297 {
298 if (! mss_seen)
299 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */
300 }
301 #ifdef CONFIG_INET_PCTCP
302 sk->mss = min(sk->max_window >> 1, sk->mtu);
303 #else
304 sk->mss = min(sk->max_window, sk->mtu);
305 sk->max_unacked = 2 * sk->mss;
306 #endif
307 }
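
/*
 * Editor's sketch: on the wire an MSS option is kind=2, length=4,
 * followed by a 16-bit value in network byte order (the bytes
 * 02 04 05 b4 advertise an MSS of 1460).  A minimal standalone walk
 * over a raw option block, same structure as the loop above but with
 * the bounds checks made explicit.  Illustrative names only.
 */
static int find_mss(const unsigned char *opt, int len, unsigned short *mss)
{
	while (len > 0) {
		int kind = opt[0];
		int size;
		if (kind == 0)				/* TCPOPT_EOL: end of options */
			return 0;
		if (kind == 1) {			/* TCPOPT_NOP: one byte of padding */
			opt++;
			len--;
			continue;
		}
		if (len < 2 || (size = opt[1]) < 2 || size > len)
			return 0;			/* malformed option, stop parsing */
		if (kind == 2 && size == 4) {		/* TCPOPT_MSS */
			*mss = (opt[2] << 8) | opt[3];	/* network byte order */
			return 1;
		}
		opt += size;
		len -= size;
	}
	return 0;
}
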
308
309
310 /*
311 * This routine handles a connection request.
312 * It should make sure we haven't already responded.
313 * Because of the way BSD works, we have to send a syn/ack now.
314 * This also means it will be harder to close a socket which is
315 * listening.
316 */
317
318 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
319 u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
320 {
321 struct sock *newsk;
322 struct tcphdr *th;
323 struct rtable *rt;
324
325 th = skb->h.th;
326
327 /* If the socket is dead, don't accept the connection. */
328 if (!sk->dead)
329 {
330 sk->data_ready(sk,0);
331 }
332 else
333 {
334 if(sk->debug)
335 printk("Reset on %p: Connect on dead socket.\n",sk);
336 tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
337 tcp_statistics.TcpAttemptFails++;
338 kfree_skb(skb, FREE_READ);
339 return;
340 }
341
342 /*
343 * Make sure we can accept more. This will prevent a
344 * flurry of syns from eating up all our memory.
345 *
346 * BSD does some funnies here and allows 3/2 times the
347 * set backlog as a fudge factor. That's just too gross.
348 */
349
350 if (sk->ack_backlog >= sk->max_ack_backlog)
351 {
352 tcp_statistics.TcpAttemptFails++;
353 kfree_skb(skb, FREE_READ);
354 return;
355 }
356
357 /*
358 * We need to build a new sock struct.
359 * It is sort of bad to have a socket without an inode attached
360 * to it, but the wake_up's will just wake up the listening socket,
361 * and if the listening socket is destroyed before this is taken
362 * off of the queue, this will take care of it.
363 */
364
365 newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
366 if (newsk == NULL)
367 {
368 /* just ignore the syn. It will get retransmitted. */
369 tcp_statistics.TcpAttemptFails++;
370 kfree_skb(skb, FREE_READ);
371 return;
372 }
373
374 memcpy(newsk, sk, sizeof(*newsk));
375 newsk->opt = NULL;
376 newsk->ip_route_cache = NULL;
377 if (opt && opt->optlen)
378 {
379 sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
380 if (!sk->opt)
381 {
382 kfree_s(newsk, sizeof(struct sock));
383 tcp_statistics.TcpAttemptFails++;
384 kfree_skb(skb, FREE_READ);
385 return;
386 }
387 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb))
388 {
389 kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
390 kfree_s(newsk, sizeof(struct sock));
391 tcp_statistics.TcpAttemptFails++;
392 kfree_skb(skb, FREE_READ);
393 return;
394 }
395 }
396 skb_queue_head_init(&newsk->write_queue);
397 skb_queue_head_init(&newsk->receive_queue);
398 newsk->send_head = NULL;
399 newsk->send_tail = NULL;
400 skb_queue_head_init(&newsk->back_log);
401 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/
402 newsk->rto = TCP_TIMEOUT_INIT;
403 newsk->mdev = TCP_TIMEOUT_INIT<<1;
404 newsk->max_window = 0;
405 newsk->cong_window = 1;
406 newsk->cong_count = 0;
407 newsk->ssthresh = 0;
408 newsk->backoff = 0;
409 newsk->blog = 0;
410 newsk->intr = 0;
411 newsk->proc = 0;
412 newsk->done = 0;
413 newsk->partial = NULL;
414 newsk->pair = NULL;
415 newsk->wmem_alloc = 0;
416 newsk->rmem_alloc = 0;
417 newsk->localroute = sk->localroute;
418
419 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
420
421 newsk->err = 0;
422 newsk->shutdown = 0;
423 newsk->ack_backlog = 0;
424 newsk->acked_seq = skb->seq+1;
425 newsk->lastwin_seq = skb->seq+1;
426 newsk->delay_acks = 1;
427 newsk->copied_seq = skb->seq+1;
428 newsk->fin_seq = skb->seq;
429 newsk->state = TCP_SYN_RECV;
430 newsk->timeout = 0;
431 newsk->ip_xmit_timeout = 0;
432 newsk->write_seq = seq;
433 newsk->window_seq = newsk->write_seq;
434 newsk->rcv_ack_seq = newsk->write_seq;
435 newsk->urg_data = 0;
436 newsk->retransmits = 0;
437 newsk->linger=0;
438 newsk->destroy = 0;
439 init_timer(&newsk->timer);
440 newsk->timer.data = (unsigned long)newsk;
441 newsk->timer.function = &net_timer;
442 init_timer(&newsk->delack_timer);
443 newsk->delack_timer.data = (unsigned long)newsk;
444 newsk->delack_timer.function = tcp_delack_timer;
445 init_timer(&newsk->retransmit_timer);
446 newsk->retransmit_timer.data = (unsigned long)newsk;
447 newsk->retransmit_timer.function = tcp_retransmit_timer;
448 newsk->dummy_th.source = skb->h.th->dest;
449 newsk->dummy_th.dest = skb->h.th->source;
450
451 /*
452 * Swap these two, they are from our point of view.
453 */
454
455 newsk->daddr = saddr;
456 newsk->saddr = daddr;
457 newsk->rcv_saddr = daddr;
458
459 put_sock(newsk->num,newsk);
460 newsk->acked_seq = skb->seq + 1;
461 newsk->copied_seq = skb->seq + 1;
462 newsk->socket = NULL;
463
464 /*
465 * Grab the ttl and tos values and use them
466 */
467
468 newsk->ip_ttl=sk->ip_ttl;
469 newsk->ip_tos=skb->ip_hdr->tos;
470
471 /*
472 * Use 512 or whatever user asked for
473 */
474
475 /*
476 * Note use of sk->user_mss, since user has no direct access to newsk
477 */
478
479 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
480 newsk->ip_route_cache = rt;
481
482 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
483 newsk->window_clamp = rt->rt_window;
484 else
485 newsk->window_clamp = 0;
486
487 if (sk->user_mss)
488 newsk->mtu = sk->user_mss;
489 else if (rt)
490 newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
491 else
492 newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
493
494 /*
495 * But not bigger than device MTU
496 */
497
498 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
499
500 #ifdef CONFIG_SKIP
501
502 /*
503 * SKIP devices set their MTU to 65535. This is so they can take packets
504 * unfragmented to the security process and then fragment. They could lie to the
505 * TCP layer about a suitable MTU, but it's easier to let skip sort it out
506 * simply because the final packet we want unfragmented is going to be
507 *
508 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]
509 */
510
511 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */
512 sk->mtu=skip_pick_mtu(sk->mtu,dev);
513 #endif
514 /*
515 * This will min with what arrived in the packet
516 */
517
518 tcp_options(newsk,skb->h.th);
519
520 tcp_cache_zap();
521 tcp_send_synack(newsk, sk, skb);
522 }
523
524
525 /*
526 * Handle a TCP window that shrunk on us. It shouldn't happen,
527 * but..
528 *
529 * We may need to move packets from the send queue
530 * to the write queue, if the window has been shrunk on us.
531 * The RFC says you are not allowed to shrink your window
532 * like this, but if the other end does, you must be able
533 * to deal with it.
534 */
535 void tcp_window_shrunk(struct sock * sk, u32 window_seq)
536 {
537 struct sk_buff *skb;
538 struct sk_buff *skb2;
539 struct sk_buff *wskb = NULL;
540
541 skb2 = sk->send_head;
542 sk->send_head = NULL;
543 sk->send_tail = NULL;
544
545 /*
546 * This is an artifact of a flawed concept. We want one
547 * queue and a smarter send routine when we send all.
548 */
549 cli();
550 while (skb2 != NULL)
551 {
552 skb = skb2;
553 skb2 = skb->link3;
554 skb->link3 = NULL;
555 if (after(skb->end_seq, window_seq))
556 {
557 if (sk->packets_out > 0)
558 sk->packets_out--;
559 /* We may need to remove this from the dev send list. */
560 if (skb->next != NULL)
561 {
562 skb_unlink(skb);
563 }
564 /* Now add it to the write_queue. */
565 if (wskb == NULL)
566 skb_queue_head(&sk->write_queue,skb);
567 else
568 skb_append(wskb,skb);
569 wskb = skb;
570 }
571 else
572 {
573 if (sk->send_head == NULL)
574 {
575 sk->send_head = skb;
576 sk->send_tail = skb;
577 }
578 else
579 {
580 sk->send_tail->link3 = skb;
581 sk->send_tail = skb;
582 }
583 skb->link3 = NULL;
584 }
585 }
586 sti();
587 }
588
589
590 /*
591 * This routine deals with incoming acks, but not outgoing ones.
592 *
593 * This routine is totally _WRONG_. The list structuring is wrong,
594 * the algorithm is wrong, the code is wrong.
595 */
596
597 static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
598 {
599 int flag = 0;
600 u32 window_seq;
601
602 /*
603 * 1 - there was data in packet as well as ack or new data is sent or
604 * in shutdown state
605 * 2 - data from retransmit queue was acked and removed
606 * 4 - window shrunk or data from retransmit queue was acked and removed
607 */
608
609 if(sk->zapped)
610 return(1); /* Dead, can't ack any more so why bother */
611
612 /*
613 * We have dropped back to keepalive timeouts. Thus we have
614 * no retransmits pending.
615 */
616
617 if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
618 sk->retransmits = 0;
619
620 /*
621 * If the ack is newer than sent or older than previous acks
622 * then we can probably ignore it.
623 */
624
625 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
626 goto uninteresting_ack;
627
628 /*
629 * If there is data set flag 1
630 */
631
632 if (len != th->doff*4)
633 flag |= 1;
634
635 /*
636 * Have we discovered a larger window
637 */
638 window_seq = ntohs(th->window);
639 if (window_seq > sk->max_window)
640 {
641 sk->max_window = window_seq;
642 #ifdef CONFIG_INET_PCTCP
643 /* Hack because we don't send partial packets to non SWS
644 handling hosts */
645 sk->mss = min(window_seq>>1, sk->mtu);
646 #else
647 sk->mss = min(window_seq, sk->mtu);
648 #endif
649 }
650 window_seq += ack;
651
652 /*
653 * See if our window has been shrunk.
654 */
655 if (after(sk->window_seq, window_seq)) {
656 flag |= 4;
657 tcp_window_shrunk(sk, window_seq);
658 }
659
660 /*
661 * Pipe has emptied
662 */
663 if (sk->send_tail == NULL || sk->send_head == NULL)
664 {
665 sk->send_head = NULL;
666 sk->send_tail = NULL;
667 sk->packets_out= 0;
668 }
669
670 /*
671 * We don't want too many packets out there.
672 */
673
674 if (sk->ip_xmit_timeout == TIME_WRITE &&
675 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
676 {
677
678 /*
679 * This is Jacobson's slow start and congestion avoidance.
680 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
681 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
682 * counter and increment it once every cwnd times. It's possible
683 * that this should be done only if sk->retransmits == 0. I'm
684 * interpreting "new data is acked" as including data that has
685 * been retransmitted but is just now being acked.
686 */
687 if (sk->cong_window < sk->ssthresh)
688 /*
689 * In "safe" area, increase
690 */
691 sk->cong_window++;
692 else
693 {
694 /*
695 * In dangerous area, increase slowly. In theory this is
696 * sk->cong_window += 1 / sk->cong_window
697 */
698 if (sk->cong_count >= sk->cong_window)
699 {
700 sk->cong_window++;
701 sk->cong_count = 0;
702 }
703 else
704 sk->cong_count++;
705 }
706 }
707
708 /*
709 * Remember the highest ack received and update the
710 * right hand window edge of the host.
711 * We do a bit of work here to track number of times we've
712 * seen this ack without a change in the right edge of the
713 * window and no data in the packet.
714 * This will allow us to do fast retransmits.
715 */
716
717 /* We are looking for duplicate ACKs here.
718 * An ACK is a duplicate if:
719 * (1) it has the same sequence number as the largest number we've seen,
720 * (2) it has the same window as the last ACK,
721 * (3) we have outstanding data that has not been ACKed
722 * (4) The packet was not carrying any data.
723 * I've tried to order these from most likely to fail
724 * to least likely to fail.
725 * [These are the rules BSD stacks use to determine if an ACK is a
726 * duplicate.]
727 */
728
729 if (sk->rcv_ack_seq == ack
730 && sk->window_seq == window_seq
731 && !(flag&1)
732 && before(ack, sk->sent_seq))
733 {
734 /* See draft-stevens-tcpca-spec-01 for explanation
735 * of what we are doing here.
736 */
737 sk->rcv_ack_cnt++;
738 if (sk->rcv_ack_cnt == MAX_DUP_ACKS+1) {
739 sk->ssthresh = max(sk->cong_window >> 1, 2);
740 sk->cong_window = sk->ssthresh+MAX_DUP_ACKS+1;
741 tcp_do_retransmit(sk,0);
742 /* reduce the count. We don't want to be
743 * seen to be in "retransmit" mode if we
744 * are doing a fast retransmit.
745 */
746 sk->retransmits--;
747 } else if (sk->rcv_ack_cnt > MAX_DUP_ACKS+1) {
748 sk->cong_window++;
749 /*
750 * At this point we are supposed to transmit a NEW
751 * packet (not retransmit the missing packet,
752 * this would only get us into a retransmit war.)
753 * I think that having just adjusted cong_window
754 * we will transmit the new packet below.
755 */
756 }
757 }
758 else
759 {
760 if (sk->rcv_ack_cnt > MAX_DUP_ACKS) {
761 sk->cong_window = sk->ssthresh;
762 }
763 sk->window_seq = window_seq;
764 sk->rcv_ack_seq = ack;
765 sk->rcv_ack_cnt = 1;
766 }
767
768 /*
769 * We passed data and got it acked, remove any soft error
770 * log. Something worked...
771 */
772
773 sk->err_soft = 0;
774
775 /*
776 * If this ack opens up a zero window, clear backoff. It was
777 * being used to time the probes, and is probably far higher than
778 * it needs to be for normal retransmission.
779 */
780
781 if (sk->ip_xmit_timeout == TIME_PROBE0)
782 {
783 sk->retransmits = 0; /* Our probe was answered */
784
785 /*
786 * Was it a usable window open ?
787 */
788
789 if (!skb_queue_empty(&sk->write_queue) && /* should always be true */
790 ! before (sk->window_seq, sk->write_queue.next->end_seq))
791 {
792 sk->backoff = 0;
793
794 /*
795 * Recompute rto from rtt. this eliminates any backoff.
796 */
797
798 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
799 if (sk->rto > 120*HZ)
800 sk->rto = 120*HZ;
801 if (sk->rto < HZ/5) /* Was 1*HZ, then 1 - turns out we must allow about
802 .2 of a second because of BSD delayed acks - on a 100Mb/sec link
803 .2 of a second is going to need huge windows (SIGH) */
804 sk->rto = HZ/5;
805 }
806 }
807
808 /*
809 * See if we can take anything off of the retransmit queue.
810 */
811
812 for (;;) {
813 struct sk_buff * skb = sk->send_head;
814 if (!skb)
815 break;
816
817 /* Check for a bug. */
818 if (skb->link3 && after(skb->end_seq, skb->link3->end_seq))
819 printk("INET: tcp.c: *** bug send_list out of order.\n");
820
821 /*
822 * If our packet is before the ack sequence we can
823 * discard it as it's confirmed to have arrived at the other end.
824 */
825
826 if (after(skb->end_seq, ack))
827 break;
828
829 if (sk->retransmits)
830 {
831 /*
832 * We were retransmitting. don't count this in RTT est
833 */
834 flag |= 2;
835 }
836
837 if ((sk->send_head = skb->link3) == NULL)
838 {
839 sk->send_tail = NULL;
840 sk->retransmits = 0;
841 }
842 /*
843 * Note that we only reset backoff and rto in the
844 * rtt recomputation code. And that doesn't happen
845 * if there were retransmissions in effect. So the
846 * first new packet after the retransmissions is
847 * sent with the backoff still in effect. Not until
848 * we get an ack from a non-retransmitted packet do
849 * we reset the backoff and rto. This allows us to deal
850 * with a situation where the network delay has increased
851 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
852 */
853
854 /*
855 * We have one less packet out there.
856 */
857
858 if (sk->packets_out > 0)
859 sk->packets_out --;
860
861 if (!(flag&2)) /* Not retransmitting */
862 tcp_rtt_estimator(sk,skb);
863 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt
864 in this case', as we just set it up */
865 IS_SKB(skb);
866
867 /*
868 * We may need to remove this from the dev send list.
869 */
870 cli();
871 if (skb->next)
872 skb_unlink(skb);
873 sti();
874 kfree_skb(skb, FREE_WRITE); /* write. */
875 if (!sk->dead)
876 sk->write_space(sk);
877 }
878
879 /*
880 * XXX someone ought to look at this too.. at the moment, if skb_peek()
881 * returns non-NULL, we completely ignore the timer stuff in the else
882 * clause. We ought to organize the code so that else clause can
883 * (should) be executed regardless, possibly moving the PROBE timer
884 * reset over. The skb_peek() thing should only move stuff to the
885 * write queue, NOT also manage the timer functions.
886 */
887
888 /*
889 * Maybe we can take some stuff off of the write queue,
890 * and put it onto the xmit queue.
891 */
892 if (skb_peek(&sk->write_queue) != NULL)
893 {
894 if (!before(sk->window_seq, sk->write_queue.next->end_seq) &&
895 (sk->retransmits == 0 ||
896 sk->ip_xmit_timeout != TIME_WRITE ||
897 !after(sk->write_queue.next->end_seq, sk->rcv_ack_seq))
898 && sk->packets_out < sk->cong_window)
899 {
900 /*
901 * Add more data to the send queue.
902 */
903 flag |= 1;
904 tcp_write_xmit(sk);
905 }
906 else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
907 sk->send_head == NULL &&
908 sk->ack_backlog == 0 &&
909 sk->state != TCP_TIME_WAIT)
910 {
911 /*
912 * Data to queue but no room.
913 */
914 tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
915 }
916 }
917 else
918 {
919 /*
920 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
921 * from TCP_CLOSE we don't do anything
922 *
923 * from anything else, if there is write data (or fin) pending,
924 * we use a TIME_WRITE timeout, else if keepalive we reset to
925 * a KEEPALIVE timeout, else we delete the timer.
926 *
927 * We do not set flag for nominal write data, otherwise we may
928 * force a state where we start to write itsy bitsy tidbits
929 * of data.
930 */
931
932 switch(sk->state) {
933 case TCP_TIME_WAIT:
934 /*
935 * keep us in TIME_WAIT until we stop getting packets,
936 * reset the timeout.
937 */
938 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
939 break;
940 case TCP_CLOSE:
941 /*
942 * don't touch the timer.
943 */
944 break;
945 default:
946 /*
947 * Must check send_head and write_queue
948 * to determine which timeout to use.
949 */
950 if (sk->send_head || !skb_queue_empty(&sk->write_queue)) {
951 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
952 } else if (sk->keepopen) {
953 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
954 } else {
955 del_timer(&sk->retransmit_timer);
956 sk->ip_xmit_timeout = 0;
957 }
958 break;
959 }
960 }
961
962 /*
963 * We have nothing queued but space to send. Send any partial
964 * packets immediately (end of Nagle rule application).
965 */
966
967 if (sk->packets_out == 0
968 && sk->partial != NULL
969 && skb_queue_empty(&sk->write_queue)
970 && sk->send_head == NULL)
971 {
972 flag |= 1;
973 tcp_send_partial(sk);
974 }
975
976 /*
977 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
978 * we are now waiting for an acknowledge to our FIN. The other end is
979 * already in TIME_WAIT.
980 *
981 * Move to TCP_CLOSE on success.
982 */
983
984 if (sk->state == TCP_LAST_ACK)
985 {
986 if (!sk->dead)
987 sk->state_change(sk);
988 if(sk->debug)
989 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
990 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
991 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
992 {
993 flag |= 1;
994 sk->shutdown = SHUTDOWN_MASK;
995 tcp_set_state(sk,TCP_CLOSE);
996 return 1;
997 }
998 }
999
1000 /*
1001 * Incoming ACK to a FIN we sent in the case of our initiating the close.
1002 *
1003 * Move to FIN_WAIT2 to await a FIN from the other end. Set
1004 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
1005 */
1006
1007 if (sk->state == TCP_FIN_WAIT1)
1008 {
1009
1010 if (!sk->dead)
1011 sk->state_change(sk);
1012 if (sk->rcv_ack_seq == sk->write_seq)
1013 {
1014 flag |= 1;
1015 sk->shutdown |= SEND_SHUTDOWN;
1016 tcp_set_state(sk, TCP_FIN_WAIT2);
1017 }
1018 }
1019
1020 /*
1021 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
1022 *
1023 * Move to TIME_WAIT
1024 */
1025
1026 if (sk->state == TCP_CLOSING)
1027 {
1028
1029 if (!sk->dead)
1030 sk->state_change(sk);
1031 if (sk->rcv_ack_seq == sk->write_seq)
1032 {
1033 flag |= 1;
1034 tcp_time_wait(sk);
1035 }
1036 }
1037
1038 /*
1039 * Final ack of a three way shake
1040 */
1041
1042 if(sk->state==TCP_SYN_RECV)
1043 {
1044 tcp_set_state(sk, TCP_ESTABLISHED);
1045 tcp_options(sk,th);
1046 sk->dummy_th.dest=th->source;
1047 sk->copied_seq = sk->acked_seq;
1048 if(!sk->dead)
1049 sk->state_change(sk);
1050 if(sk->max_window==0)
1051 {
1052 sk->max_window=32; /* Sanity check */
1053 sk->mss=min(sk->max_window,sk->mtu);
1054 }
1055 }
1056
1057 /*
1058 * I make no guarantees about the first clause in the following
1059 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
1060 * what conditions "!flag" would be true. However I think the rest
1061 * of the conditions would prevent that from causing any
1062 * unnecessary retransmission.
1063 * Clearly if the first packet has expired it should be
1064 * retransmitted. The other alternative, "flag&2 && retransmits", is
1065 * harder to explain: You have to look carefully at how and when the
1066 * timer is set and with what timeout. The most recent transmission always
1067 * sets the timer. So in general if the most recent thing has timed
1068 * out, everything before it has as well. So we want to go ahead and
1069 * retransmit some more. If we didn't explicitly test for this
1070 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
1071 * would not be true. If you look at the pattern of timing, you can
1072 * show that rto is increased fast enough that the next packet would
1073 * almost never be retransmitted immediately. Then you'd end up
1074 * waiting for a timeout to send each packet on the retransmission
1075 * queue. With my implementation of the Karn sampling algorithm,
1076 * the timeout would double each time. The net result is that it would
1077 * take a hideous amount of time to recover from a single dropped packet.
1078 * It's possible that there should also be a test for TIME_WRITE, but
1079 * I think as long as "send_head != NULL" and "retransmit" is on, we've
1080 * got to be in real retransmission mode.
1081 * Note that tcp_do_retransmit is called with all==1. Setting cong_window
1082 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
1083 * As long as no further losses occur, this seems reasonable.
1084 */
1085
1086 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
1087 (((flag&2) && sk->retransmits) ||
1088 (sk->send_head->when + sk->rto < jiffies)))
1089 {
1090 if(sk->send_head->when + sk->rto < jiffies)
1091 tcp_retransmit(sk,0);
1092 else
1093 {
1094 tcp_do_retransmit(sk, 1);
1095 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1096 }
1097 }
1098
1099 return 1;
1100
1101 uninteresting_ack:
1102 if(sk->debug)
1103 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
1104
1105 /*
1106 * Keepalive processing.
1107 */
1108
1109 if (after(ack, sk->sent_seq))
1110 {
1111 return 0;
1112 }
1113
1114 /*
1115 * Restart the keepalive timer.
1116 */
1117
1118 if (sk->keepopen)
1119 {
1120 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
1121 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1122 }
1123 return 1;
1124 }
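
/*
 * Editor's sketch of the congestion window handling implemented in
 * tcp_ack() above: the window is counted in whole segments, grows by
 * one per ack in slow start and by roughly 1/cwnd per ack in
 * congestion avoidance, and a run of identical acks triggers a
 * Reno-style fast retransmit.  Standalone and with illustrative
 * names; the actual retransmission is left as a callback.
 */
struct cwnd_sketch {
	unsigned long cwnd;		/* congestion window, in segments */
	unsigned long ssthresh;		/* slow start threshold */
	unsigned long cong_count;	/* acks seen since the last cwnd++ */
	int dup_acks;			/* identical acks seen in a row */
};

#define SKETCH_DUP_ACKS 3		/* stands in for MAX_DUP_ACKS */

/* An ack that advances the left window edge: grow the window. */
static void sketch_ack_new_data(struct cwnd_sketch *c)
{
	c->dup_acks = 0;
	if (c->cwnd < c->ssthresh)
		c->cwnd++;			/* slow start: one segment per ack */
	else if (c->cong_count >= c->cwnd) {
		c->cwnd++;			/* congestion avoidance: ~1/cwnd per ack */
		c->cong_count = 0;
	} else
		c->cong_count++;
}

/* A duplicate ack: same ack, same window, no data, data outstanding. */
static void sketch_dup_ack(struct cwnd_sketch *c, void (*retransmit_one)(void))
{
	c->dup_acks++;
	if (c->dup_acks == SKETCH_DUP_ACKS) {
		c->ssthresh = c->cwnd >> 1;
		if (c->ssthresh < 2)
			c->ssthresh = 2;
		c->cwnd = c->ssthresh + SKETCH_DUP_ACKS + 1;	/* deflate but keep the pipe primed */
		retransmit_one();				/* fast retransmit the missing segment */
	} else if (c->dup_acks > SKETCH_DUP_ACKS)
		c->cwnd++;		/* each further dup ack frees a segment in flight */
}
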
1125
1126
1127 /*
1128 * Process the FIN bit. This now behaves as it is supposed to:
1129 * the FIN takes effect only when it is validly part of the sequence
1130 * space, not earlier while we still have holes.
1131 *
1132 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1133 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
1134 * TIME-WAIT)
1135 *
1136 * If we are in FINWAIT-1, a received FIN indicates simultaneous
1137 * close and we go into CLOSING (and later onto TIME-WAIT)
1138 *
1139 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1140 *
1141 */
1142
1143 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
1144 {
1145 sk->fin_seq = skb->end_seq;
1146
1147 if (!sk->dead)
1148 {
1149 sk->state_change(sk);
1150 sock_wake_async(sk->socket, 1);
1151 }
1152
1153 switch(sk->state)
1154 {
1155 case TCP_SYN_RECV:
1156 case TCP_SYN_SENT:
1157 case TCP_ESTABLISHED:
1158 /*
1159 * move to CLOSE_WAIT, tcp_data() already handled
1160 * sending the ack.
1161 */
1162 tcp_set_state(sk,TCP_CLOSE_WAIT);
1163 if (th->rst)
1164 sk->shutdown = SHUTDOWN_MASK;
1165 break;
1166
1167 case TCP_CLOSE_WAIT:
1168 case TCP_CLOSING:
1169 /*
1170 * received a retransmission of the FIN, do
1171 * nothing.
1172 */
1173 break;
1174 case TCP_TIME_WAIT:
1175 /*
1176 * received a retransmission of the FIN,
1177 * restart the TIME_WAIT timer.
1178 */
1179 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1180 return(0);
1181 case TCP_FIN_WAIT1:
1182 /*
1183 * This case occurs when a simultaneous close
1184 * happens, we must ack the received FIN and
1185 * enter the CLOSING state.
1186 *
1187 * This causes a WRITE timeout, which will either
1188 * move on to TIME_WAIT when we timeout, or resend
1189 * the FIN properly (maybe we get rid of that annoying
1190 * FIN lost hang). The TIME_WRITE code is already correct
1191 * for handling this timeout.
1192 */
1193
1194 if(sk->ip_xmit_timeout != TIME_WRITE)
1195 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1196 tcp_set_state(sk,TCP_CLOSING);
1197 break;
1198 case TCP_FIN_WAIT2:
1199 /*
1200 * received a FIN -- send ACK and enter TIME_WAIT
1201 */
1202 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1203 sk->shutdown|=SHUTDOWN_MASK;
1204 tcp_set_state(sk,TCP_TIME_WAIT);
1205 break;
1206 case TCP_CLOSE:
1207 /*
1208 * already in CLOSE
1209 */
1210 break;
1211 default:
1212 tcp_set_state(sk,TCP_LAST_ACK);
1213
1214 /* Start the timers. */
1215 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1216 return(0);
1217 }
1218
1219 return(0);
1220 }
1221
1222 /*
1223 * Add a sk_buff to the TCP receive queue, calculating
1224 * the ACK sequence as we go..
1225 */
1226 static inline void tcp_insert_skb(struct sk_buff * skb, struct sk_buff_head * list)
1227 {
1228 struct sk_buff * prev, * next;
1229 u32 seq;
1230
1231 /*
1232 * Find where the new skb goes.. (This goes backwards,
1233 * on the assumption that we get the packets in order)
1234 */
1235 seq = skb->seq;
1236 prev = list->prev;
1237 next = (struct sk_buff *) list;
1238 for (;;) {
1239 if (prev == (struct sk_buff *) list || !after(prev->seq, seq))
1240 break;
1241 next = prev;
1242 prev = prev->prev;
1243 }
1244 __skb_insert(skb, prev, next, list);
1245 }
1246
1247 /*
1248 * Called for each packet when we find a new ACK endpoint sequence in it
1249 */
1250 static inline u32 tcp_queue_ack(struct sk_buff * skb, struct sock * sk)
1251 {
1252 /*
1253 * When we ack the fin, we do the FIN
1254 * processing.
1255 */
1256 skb->acked = 1;
1257 if (skb->h.th->fin)
1258 tcp_fin(skb,sk,skb->h.th);
1259 return skb->end_seq;
1260 }
1261
1262 static void tcp_queue(struct sk_buff * skb, struct sock * sk, struct tcphdr *th)
1263 {
1264 u32 ack_seq;
1265
1266 tcp_insert_skb(skb, &sk->receive_queue);
1267
1268 /*
1269 * Did we get anything new to ack?
1270 */
1271 ack_seq = sk->acked_seq;
1272
1273
1274 if (!after(skb->seq, ack_seq)) {
1275 if (after(skb->end_seq, ack_seq)) {
1276 /* the packet straddles our window end */
1277 struct sk_buff_head * list = &sk->receive_queue;
1278 struct sk_buff * next;
1279 ack_seq = tcp_queue_ack(skb, sk);
1280
1281 /*
1282 * Do we have any old packets to ack that the above
1283 * made visible? (Go forward from skb)
1284 */
1285 next = skb->next;
1286 while (next != (struct sk_buff *) list) {
1287 if (after(next->seq, ack_seq))
1288 break;
1289 if (after(next->end_seq, ack_seq))
1290 ack_seq = tcp_queue_ack(next, sk);
1291 next = next->next;
1292 }
1293
1294 /*
1295 * Ok, we found new data, update acked_seq as
1296 * necessary (and possibly send the actual
1297 * ACK packet).
1298 */
1299 sk->acked_seq = ack_seq;
1300
1301 } else {
1302 if (sk->debug)
1303 printk("Ack duplicate packet.\n");
1304 tcp_send_ack(sk);
1305 return;
1306 }
1307
1308
1309 /*
1310 * Delay the ack if possible. Send ack's to
1311 * fin frames immediately as there shouldn't be
1312 * anything more to come.
1313 */
1314 if (!sk->delay_acks || th->fin) {
1315 tcp_send_ack(sk);
1316 } else {
1317 /*
1318 * If psh is set we assume it's an
1319 * interactive session that wants quick
1320 * acks to avoid nagling too much.
1321 */
1322 int delay = HZ/2;
1323 if (th->psh)
1324 delay = HZ/50;
1325 tcp_send_delayed_ack(sk, delay);
1326 }
1327
1328 /*
1329 * Tell the user we have some more data.
1330 */
1331
1332 if (!sk->dead)
1333 sk->data_ready(sk,0);
1334
1335 }
1336 else
1337 {
1338 /*
1339 * If we've missed a packet, send an ack.
1340 * Also start a timer to send another.
1341 *
1342 * 4.3reno machines look for these kind of acks so
1343 * they can do fast recovery. Three identical 'old'
1344 * acks lets it know that one frame has been lost
1345 * and should be resent. Because this is before the
1346 * whole window of data has timed out it can take
1347 * one lost frame per window without stalling.
1348 * [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
1349 *
1350 * We also should be spotting triple bad sequences.
1351 * [We now do this.]
1352 *
1353 */
1354
1355 if (!skb->acked)
1356 {
1357 if(sk->debug)
1358 printk("Ack past end of seq packet.\n");
1359 tcp_send_ack(sk);
1360 tcp_send_delayed_ack(sk,HZ/2);
1361 }
1362 }
1363 }
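
/*
 * Editor's sketch of the ack-advance walk done in tcp_queue() above:
 * once a segment fills the gap at acked_seq we keep swallowing queued
 * segments for as long as they stay contiguous.  Written against a
 * plain array of (seq, end_seq) pairs, sorted by seq, purely for
 * illustration.
 */
struct seg_sketch { u32 seq, end_seq; };

static u32 advance_acked(u32 acked_seq, const struct seg_sketch *q, int n)
{
	int i;
	for (i = 0; i < n; i++) {
		if (after(q[i].seq, acked_seq))
			break;				/* hole in the data: stop here */
		if (after(q[i].end_seq, acked_seq))
			acked_seq = q[i].end_seq;	/* segment extends the in-order data */
	}
	return acked_seq;
}
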
1364
1365
1366 /*
1367 * This routine handles the data. If there is room in the buffer,
1368 * it will have already been moved into it. If there is no
1369 * room, then we will just have to discard the packet.
1370 */
1371
1372 static int tcp_data(struct sk_buff *skb, struct sock *sk,
1373 unsigned long saddr, unsigned int len)
1374 {
1375 struct tcphdr *th;
1376 u32 new_seq, shut_seq;
1377
1378 th = skb->h.th;
1379 skb_pull(skb,th->doff*4);
1380 skb_trim(skb,len-(th->doff*4));
1381
1382 /*
1383 * The number of bytes in the receive read/assembly queue has increased. Needed for the
1384 * low memory discard algorithm
1385 */
1386
1387 sk->bytes_rcv += skb->len;
1388
1389 if (skb->len == 0 && !th->fin)
1390 {
1391 /*
1392 * Don't want to keep passing ack's back and forth.
1393 * (someone sent us dataless, boring frame)
1394 */
1395 if (!th->ack)
1396 tcp_send_ack(sk);
1397 kfree_skb(skb, FREE_READ);
1398 return(0);
1399 }
1400
1401 /*
1402 * We no longer have anyone receiving data on this connection.
1403 */
1404
1405 #ifndef TCP_DONT_RST_SHUTDOWN
1406
1407 if(sk->shutdown & RCV_SHUTDOWN)
1408 {
1409 /*
1410 * FIXME: BSD has some magic to avoid sending resets to
1411 * broken 4.2 BSD keepalives. Much to my surprise a few non
1412 * BSD stacks still have broken keepalives so we want to
1413 * cope with it.
1414 */
1415
1416 if(skb->len) /* We don't care if it's just an ack or
1417 a keepalive/window probe */
1418 {
1419 new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */
1420
1421 /* Do this the way 4.4BSD treats it. Not what I'd
1422 regard as the meaning of the spec but it's what BSD
1423 does and clearly they know everything 8) */
1424
1425 /*
1426 * This is valid because of two things
1427 *
1428 * a) The way tcp_data behaves at the bottom.
1429 * b) A fin takes effect when read not when received.
1430 */
1431
1432 shut_seq = sk->acked_seq+1; /* Last byte */
1433
1434 if(after(new_seq,shut_seq))
1435 {
1436 if(sk->debug)
1437 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
1438 sk, new_seq, shut_seq, sk->blog);
1439 if(sk->dead)
1440 {
1441 sk->acked_seq = new_seq + th->fin;
1442 tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
1443 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
1444 tcp_statistics.TcpEstabResets++;
1445 sk->err = EPIPE;
1446 sk->error_report(sk);
1447 sk->shutdown = SHUTDOWN_MASK;
1448 tcp_set_state(sk,TCP_CLOSE);
1449 kfree_skb(skb, FREE_READ);
1450 return 0;
1451 }
1452 }
1453 }
1454 }
1455
1456 #endif
1457
1458 tcp_queue(skb, sk, th);
1459
1460 return(0);
1461 }
1462
1463
1464 /*
1465 * This routine is only called when we have urgent data
1466 * signalled. It's the 'slow' part of tcp_urg. It could be
1467 * moved inline now as tcp_urg is only called from one
1468 * place. We handle URGent data wrong. We have to - as
1469 * BSD still doesn't use the correction from RFC961.
1470 */
1471
1472 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
1473 {
1474 u32 ptr = ntohs(th->urg_ptr);
1475
1476 if (ptr)
1477 ptr--;
1478 ptr += ntohl(th->seq);
1479
1480 /* ignore urgent data that we've already seen and read */
1481 if (after(sk->copied_seq, ptr))
1482 return;
1483
1484 /* do we already have a newer (or duplicate) urgent pointer? */
1485 if (sk->urg_data && !after(ptr, sk->urg_seq))
1486 return;
1487
1488 /* tell the world about our new urgent pointer */
1489 if (sk->proc != 0) {
1490 if (sk->proc > 0) {
1491 kill_proc(sk->proc, SIGURG, 1);
1492 } else {
1493 kill_pg(-sk->proc, SIGURG, 1);
1494 }
1495 }
1496 sk->urg_data = URG_NOTYET;
1497 sk->urg_seq = ptr;
1498 }
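
/*
 * Editor's sketch: the arithmetic above turns the 16-bit urgent offset
 * in the header into an absolute sequence number for the urgent byte
 * (using the BSD interpretation that the pointer names the byte after
 * the urgent one, hence the decrement); tcp_urg() below then turns
 * that back into an offset within a later segment.  Illustrative
 * names only.
 */
static u32 urg_byte_seq(u32 seg_seq, unsigned short urg_ptr_host)
{
	if (urg_ptr_host)
		urg_ptr_host--;			/* BSD: pointer is one past the urgent byte */
	return seg_seq + urg_ptr_host;		/* sequence number of the urgent byte */
}

static long urg_offset_in_segment(u32 urg_seq, u32 seg_seq, int header_len)
{
	/* offset from the start of the TCP header; a result at or beyond
	   the total segment length means the byte is in a later segment */
	return (long) (urg_seq - seg_seq) + header_len;
}
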
1499
1500 /*
1501 * This is the 'fast' part of urgent handling.
1502 */
1503
1504 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
1505 {
1506 /*
1507 * Check if we get a new urgent pointer - normally not
1508 */
1509
1510 if (th->urg)
1511 tcp_check_urg(sk,th);
1512
1513 /*
1514 * Do we wait for any urgent data? - normally not
1515 */
1516
1517 if (sk->urg_data == URG_NOTYET) {
1518 u32 ptr;
1519
1520 /*
1521 * Is the urgent pointer pointing into this packet?
1522 */
1523 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
1524 if (ptr < len) {
1525 sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
1526 if (!sk->dead)
1527 sk->data_ready(sk,0);
1528 }
1529 }
1530 }
1531
1532 /*
1533 * This should be a bit smarter and remove partially
1534 * overlapping stuff too, but this should be good
1535 * enough for any even remotely normal case (and the
1536 * worst that can happen is that we have a few
1537 * unnecessary packets in the receive queue).
1538 *
1539 * This function is never called with an empty list..
1540 */
1541 static inline void tcp_remove_dups(struct sk_buff_head * list)
1542 {
1543 struct sk_buff * next = list->next;
1544
1545 for (;;) {
1546 struct sk_buff * skb = next;
1547 next = next->next;
1548 if (next == (struct sk_buff *) list)
1549 break;
1550 if (before(next->end_seq, skb->end_seq)) {
1551 __skb_unlink(next, list);
1552 kfree_skb(next, FREE_READ);
1553 next = skb;
1554 continue;
1555 }
1556 if (next->seq != skb->seq)
1557 continue;
1558 __skb_unlink(skb, list);
1559 kfree_skb(skb, FREE_READ);
1560 }
1561 }
1562
1563 /*
1564 * Throw out all unnecessary packets: we've gone over the
1565 * receive queue limit. This shouldn't happen in a normal
1566 * TCP connection, but we might have gotten duplicates etc.
1567 */
1568 static void prune_queue(struct sk_buff_head * list)
1569 {
1570 for (;;) {
1571 struct sk_buff * skb = list->prev;
1572
1573 /* gone through it all? */
1574 if (skb == (struct sk_buff *) list)
1575 break;
1576 if (!skb->acked) {
1577 __skb_unlink(skb, list);
1578 kfree_skb(skb, FREE_READ);
1579 continue;
1580 }
1581 tcp_remove_dups(list);
1582 break;
1583 }
1584 }
1585
1586 /*
1587 * A TCP packet has arrived.
1588 * skb->h.raw is the TCP header.
1589 */
1590
1591 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
1592 __u32 daddr, unsigned short len,
1593 __u32 saddr, int redo, struct inet_protocol * protocol)
1594 {
1595 struct tcphdr *th;
1596 struct sock *sk;
1597 int syn_ok=0;
1598
1599 /*
1600 * "redo" is 1 if we have already seen this skb but couldn't
1601 * use it at that time (the socket was locked). In that case
1602 * we have already done a lot of the work (looked up the socket
1603 * etc).
1604 */
1605 th = skb->h.th;
1606 sk = skb->sk;
1607 if (!redo) {
1608 tcp_statistics.TcpInSegs++;
1609 if (skb->pkt_type!=PACKET_HOST)
1610 goto discard_it;
1611
1612 /*
1613 * Pull up the IP header.
1614 */
1615
1616 skb_pull(skb, skb->h.raw-skb->data);
1617
1618 /*
1619 * Try to use the device checksum if provided.
1620 */
1621 switch (skb->ip_summed)
1622 {
1623 case CHECKSUM_NONE:
1624 skb->csum = csum_partial((char *)th, len, 0);
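/* deliberate fall through: the checksum just computed in software is
   verified by the CHECKSUM_HW case below */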
1625 case CHECKSUM_HW:
1626 if (tcp_check(th, len, saddr, daddr, skb->csum))
1627 goto discard_it;
1628 default:
1629 /* CHECKSUM_UNNECESSARY */
1630 }
1631 sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
1632 if (!sk)
1633 goto no_tcp_socket;
1634 skb->sk = sk;
1635 skb->seq = ntohl(th->seq);
1636 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
1637 skb->ack_seq = ntohl(th->ack_seq);
1638
1639 skb->acked = 0;
1640 skb->used = 0;
1641 skb->free = 1;
1642 skb->saddr = daddr;
1643 skb->daddr = saddr;
1644
1645 /*
1646 * We may need to add it to the backlog here.
1647 */
1648 if (sk->users)
1649 {
1650 __skb_queue_tail(&sk->back_log, skb);
1651 return(0);
1652 }
1653 }
1654
1655 /*
1656 * If this socket has got a reset it's to all intents and purposes
1657 * really dead. Count closed sockets as dead.
1658 *
1659 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD
1660 * simply drops data. This seems incorrect as a 'closed' TCP doesn't
1661 * exist so should cause resets as if the port was unreachable.
1662 */
1663
1664 if (sk->zapped || sk->state==TCP_CLOSE)
1665 goto no_tcp_socket;
1666
1667 if (!sk->prot)
1668 {
1669 printk("IMPOSSIBLE 3\n");
1670 return(0);
1671 }
1672
1673
1674 /*
1675 * Charge the memory to the socket.
1676 */
1677
1678 skb->sk=sk;
1679 atomic_add(skb->truesize, &sk->rmem_alloc);
1680
1681 /*
1682 * We should now do header prediction.
1683 */
1684
1685 /*
1686 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
1687 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
1688 * compatibility. We also set up variables more thoroughly [Karn notes in the
1689 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
1690 */
1691
1692 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */
1693 {
1694
1695 /*
1696 * Now deal with unusual cases.
1697 */
1698
1699 if(sk->state==TCP_LISTEN)
1700 {
1701 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */
1702 tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
1703
1704 /*
1705 * We don't care about RST, and non-SYN segments are absorbed (old segments).
1706 * Broadcast/multicast SYN isn't allowed. Note - bug: if you change the
1707 * netmask on a running connection it can go broadcast. Even Suns have
1708 * this problem so I'm ignoring it
1709 */
1710
1711 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
1712 {
1713 kfree_skb(skb, FREE_READ);
1714 return 0;
1715 }
1716
1717 /*
1718 * Guess we need to make a new socket up
1719 */
1720
1721 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
1722
1723 /*
1724 * Now we have several options: In theory there is nothing else
1725 * in the frame. KA9Q has an option to send data with the syn,
1726 * BSD accepts data with the syn up to the [to be] advertised window
1727 * and Solaris 2.1 gives you a protocol error. For now we just ignore
1728 * it, that fits the spec precisely and avoids incompatibilities. It
1729 * would be nice in future to drop through and process the data.
1730 *
1731 * Now that T/TCP is starting to see use, we ought to queue this data.
1732 */
1733
1734 return 0;
1735 }
1736
1737 /*
1738 * Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
1739 * then it's a new connection
1740 */
1741
1742 if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
1743 {
1744 kfree_skb(skb, FREE_READ);
1745 return 0;
1746 }
1747
1748 /*
1749 * SYN sent means we have to look for a suitable ack and either reset
1750 * for bad matches or go to connected. The SYN_SENT case is unusual and should
1751 * not be in line code. [AC]
1752 */
1753
1754 if(sk->state==TCP_SYN_SENT)
1755 {
1756 /* Crossed SYN or previous junk segment */
1757 if(th->ack)
1758 {
1759 /* We got an ack, but it's not a good ack */
1760 if(!tcp_ack(sk,th,skb->ack_seq,len))
1761 {
1762 /* Reset the ack - it's an ack from a
1763 different connection [ th->rst is checked in tcp_send_reset()] */
1764 tcp_statistics.TcpAttemptFails++;
1765 tcp_send_reset(daddr, saddr, th,
1766 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1767 kfree_skb(skb, FREE_READ);
1768 return(0);
1769 }
1770 if(th->rst)
1771 return tcp_reset(sk,skb);
1772 if(!th->syn)
1773 {
1774 /* A valid ack from a different connection
1775 start. Shouldn't happen but cover it */
1776 tcp_statistics.TcpAttemptFails++;
1777 tcp_send_reset(daddr, saddr, th,
1778 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1779 kfree_skb(skb, FREE_READ);
1780 return 0;
1781 }
1782 /*
1783 * Ok.. it's good. Set up sequence numbers and
1784 * move to established.
1785 */
1786 syn_ok=1; /* Don't reset this connection for the syn */
1787 sk->acked_seq = skb->seq+1;
1788 sk->lastwin_seq = skb->seq+1;
1789 sk->fin_seq = skb->seq;
1790 tcp_send_ack(sk);
1791 tcp_set_state(sk, TCP_ESTABLISHED);
1792 tcp_options(sk,th);
1793 sk->dummy_th.dest=th->source;
1794 sk->copied_seq = sk->acked_seq;
1795 if(!sk->dead)
1796 {
1797 sk->state_change(sk);
1798 sock_wake_async(sk->socket, 0);
1799 }
1800 if(sk->max_window==0)
1801 {
1802 sk->max_window = 32;
1803 sk->mss = min(sk->max_window, sk->mtu);
1804 }
1805 }
1806 else
1807 {
1808 /* See if SYN's cross. Drop if boring */
1809 if(th->syn && !th->rst)
1810 {
1811 /* Crossed SYN's are fine - but talking to
1812 yourself is right out... */
1813 if(sk->saddr==saddr && sk->daddr==daddr &&
1814 sk->dummy_th.source==th->source &&
1815 sk->dummy_th.dest==th->dest)
1816 {
1817 tcp_statistics.TcpAttemptFails++;
1818 return tcp_reset(sk,skb);
1819 }
1820 tcp_set_state(sk,TCP_SYN_RECV);
1821
1822 /*
1823 * FIXME:
1824 * Must send SYN|ACK here
1825 */
1826 }
1827 /* Discard junk segment */
1828 kfree_skb(skb, FREE_READ);
1829 return 0;
1830 }
1831 /*
1832 * SYN_RECV with data maybe.. drop through
1833 */
1834 goto rfc_step6;
1835 }
1836
1837 /*
1838 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
1839 * a more complex suggestion for fixing these reuse issues in RFC1644
1840 * but not yet ready for general use. Also see RFC1379.
1841 *
1842 * Note the funny way we go back to the top of this function for
1843 * this case ("goto try_next_socket"). That also takes care of
1844 * checking "sk->users" for the new socket as well as doing all
1845 * the normal tests on the packet.
1846 */
1847
1848 #define BSD_TIME_WAIT
1849 #ifdef BSD_TIME_WAIT
1850 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
1851 after(skb->seq, sk->acked_seq) && !th->rst)
1852 {
1853 u32 seq = sk->write_seq;
1854 if(sk->debug)
1855 printk("Doing a BSD time wait\n");
1856 tcp_statistics.TcpEstabResets++;
1857 atomic_sub(skb->truesize, &sk->rmem_alloc);
1858 skb->sk = NULL;
1859 sk->err=ECONNRESET;
1860 tcp_set_state(sk, TCP_CLOSE);
1861 sk->shutdown = SHUTDOWN_MASK;
1862 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
1863 /* this is not really correct: we should check sk->users */
1864 if (sk && sk->state==TCP_LISTEN)
1865 {
1866 skb->sk = sk;
1867 atomic_add(skb->truesize, &sk->rmem_alloc);
1868 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
1869 return 0;
1870 }
1871 kfree_skb(skb, FREE_READ);
1872 return 0;
1873 }
1874 #endif
1875 }
1876
1877 /*
1878 * We are now in normal data flow (see the step list in the RFC)
1879 * Note most of these are inline now. I'll inline the lot when
1880 * I have time to test it hard and look at what gcc outputs
1881 */
1882
1883 if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
1884 {
1885 bad_tcp_sequence(sk, th, skb->end_seq-th->syn, dev);
1886 kfree_skb(skb, FREE_READ);
1887 return 0;
1888 }
1889
1890 if(th->rst)
1891 return tcp_reset(sk,skb);
1892
1893 /*
1894 * !syn_ok is effectively the state test in RFC793.
1895 */
1896
1897 if(th->syn && !syn_ok)
1898 {
1899 tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
1900 return tcp_reset(sk,skb);
1901 }
1902
1903 tcp_delack_estimator(sk);
1904
1905 /*
1906 * Process the ACK
1907 */
1908
1909
1910 if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
1911 {
1912 /*
1913 * Our three way handshake failed.
1914 */
1915
1916 if(sk->state==TCP_SYN_RECV)
1917 {
1918 tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
1919 }
1920 kfree_skb(skb, FREE_READ);
1921 return 0;
1922 }
1923
1924 rfc_step6: /* I'll clean this up later */
1925
1926 /*
1927 * If the accepted buffer put us over our queue size we
1928 * now drop it (we must process the ack first to avoid
1929 * deadlock cases).
1930 */
1931
1932 /*
1933 * Process urgent data
1934 */
1935
1936 tcp_urg(sk, th, len);
1937
1938 /*
1939 * Process the encapsulated data
1940 */
1941
1942 if(tcp_data(skb,sk, saddr, len))
1943 kfree_skb(skb, FREE_READ);
1944
1945 /*
1946 * If our receive queue has grown past its limits,
1947 * try to prune away duplicates etc..
1948 */
1949 if (sk->rmem_alloc > sk->rcvbuf)
1950 prune_queue(&sk->receive_queue);
1951
1952 /*
1953 * And done
1954 */
1955
1956 return 0;
1957
1958 no_tcp_socket:
1959 /*
1960 * No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
1961 */
1962 tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
1963
1964 discard_it:
1965 /*
1966 * Discard frame
1967 */
1968 skb->sk = NULL;
1969 kfree_skb(skb, FREE_READ);
1970 return 0;
1971 }