net/ipv4/tcp

/* */
This source file includes the following definitions.
tcp_delack_estimator
tcp_rtt_estimator
tcp_cache_zap
get_tcp_sock
bad_tcp_sequence
tcp_sequence
tcp_reset
tcp_options
tcp_conn_request
tcp_window_shrunk
tcp_ack
tcp_fin
tcp_insert_skb
tcp_queue_ack
tcp_queue
tcp_data
tcp_check_urg
tcp_urg
tcp_remove_dups
prune_queue
tcp_rcv
   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp_input.c 1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * FIXES
  23  *              Pedro Roque     :       Double ACK bug
  24  */
  25 
  26 #include <linux/config.h>
  27 #include <net/tcp.h>
  28 
  29 /*
   30  *      Policy code extracted so it's now separate
  31  */
  32 
  33 /*
  34  *      Called each time to estimate the delayed ack timeout. This is
  35  *      how it should be done so a fast link isn't impacted by ack delay.
  36  */
  37  
  38 extern __inline__ void tcp_delack_estimator(struct sock *sk)
     /*  */
  39 {
  40         /*
  41          *      Delayed ACK time estimator.
  42          */
  43         
  44         if (sk->lrcvtime == 0) 
  45         {
  46                 sk->lrcvtime = jiffies;
  47                 sk->ato = HZ/3;
  48         }
  49         else 
  50         {
  51                 int m;
  52                 
  53                 m = jiffies - sk->lrcvtime;
  54 
  55                 sk->lrcvtime = jiffies;
  56 
  57                 if (m <= 0)
  58                         m = 1;
  59 
  60                 if (m > (sk->rtt >> 3)) 
  61                 {
  62                         sk->ato = sk->rtt >> 3;
  63                         /*
  64                          * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
  65                          */
  66                 }
  67                 else 
  68                 {
  69                         sk->ato = (sk->ato >> 1) + m;
  70                         /*
  71                          * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
  72                          */
  73                 }
  74         }
  75 }
  76 
  77 /*
  78  *      Called on frames that were known _not_ to have been
  79  *      retransmitted [see Karn/Partridge Proceedings SIGCOMM 87]. 
  80  *      The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
  81  */
  82  
  83 extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
     /*  */
  84 {
  85         long m;
  86         /*
  87          *      The following amusing code comes from Jacobson's
  88          *      article in SIGCOMM '88.  Note that rtt and mdev
  89          *      are scaled versions of rtt and mean deviation.
  90          *      This is designed to be as fast as possible 
  91          *      m stands for "measurement".
  92          */
  93         
  94         m = jiffies - oskb->when;  /* RTT */
  95         if(m<=0)
  96                 m=1;            /* IS THIS RIGHT FOR <0 ??? */
  97         m -= (sk->rtt >> 3);    /* m is now error in rtt est */
  98         sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
  99         if (m < 0)
 100                 m = -m;         /* m is now abs(error) */
 101         m -= (sk->mdev >> 2);   /* similar update on mdev */
 102         sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
 103 
 104         /*
 105          *      Now update timeout.  Note that this removes any backoff.
 106          */
 107                          
 108         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
 109         if (sk->rto > 120*HZ)
 110                 sk->rto = 120*HZ;
 111         if (sk->rto < HZ/5)     /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
 112                 sk->rto = HZ/5;
 113         sk->backoff = 0;
 114 }
 115 
 116 /*
 117  *      Cached last hit socket
 118  */
 119  
 120 static volatile unsigned long   th_cache_saddr, th_cache_daddr;
 121 static volatile unsigned short  th_cache_dport, th_cache_sport;
 122 static volatile struct sock *th_cache_sk;
 123 
 124 void tcp_cache_zap(void)
     /*  */
 125 {
 126         th_cache_sk=NULL;
 127 }
 128 
 129 /*
 130  *      Find the socket, using the last hit cache if applicable. The cache is not quite
 131  *      right...
 132  */
 133 
 134 static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
     /*  */
 135 {
 136         struct sock * sk;
 137 
 138         sk = (struct sock *) th_cache_sk;
 139         if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
 140             sport != th_cache_sport || dport != th_cache_dport) {
 141                 sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
 142                 if (sk) {
 143                         th_cache_saddr=saddr;
 144                         th_cache_daddr=daddr;
 145                         th_cache_dport=dport;
 146                         th_cache_sport=sport;
 147                         th_cache_sk=sk;
 148                 }
 149         }
 150         return sk;
 151 }
 152 
 153 /*
  154  * React to an out-of-window TCP sequence number in an incoming packet
 155  */
 156  
 157 static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, u32 end_seq,
     /*  */
 158               struct device *dev)
 159 {
 160         if (th->rst)
 161                 return;
 162 
 163         /*
 164          *      Send a reset if we get something not ours and we are
 165          *      unsynchronized. Note: We don't do anything to our end. We
 166          *      are just killing the bogus remote connection then we will
 167          *      connect again and it will work (with luck).
 168          */
 169          
 170         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
 171         {
 172                 tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
 173                 return;
 174         }
 175 
 176         /*
 177          *      4.3reno machines look for these kind of acks so they can do fast
 178          *      recovery. Three identical 'old' acks lets it know that one frame has
 179          *      been lost and should be resent. Because this is before the whole window
 180          *      of data has timed out it can take one lost frame per window without
 181          *      stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
 182          */
 183         tcp_send_ack(sk);
 184 }
 185 
 186 /*
  187  *      This function checks to see if the tcp header is actually acceptable.
 188  */
 189  
 190 extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
     /*  */
 191 {
 192         u32 end_window = sk->acked_seq + sk->window;
 193         return  /* if start is at end of window, end must be too (zero window) */
 194                 (seq == end_window && seq == end_seq) ||
 195                 /* if start is before end of window, check for interest */
 196                 (before(seq, end_window) && !before(end_seq, sk->acked_seq));
 197 }
 198 
 199 /*
 200  *      When we get a reset we do this. This probably is a tcp_output routine
 201  *      really.
 202  */
 203 
 204 static int tcp_reset(struct sock *sk, struct sk_buff *skb)
     /*  */
 205 {
 206         sk->zapped = 1;
 207         /*
 208          *      We want the right error as BSD sees it (and indeed as we do).
 209          */
 210         sk->err = ECONNRESET;
 211         if (sk->state == TCP_SYN_SENT)
 212                 sk->err = ECONNREFUSED;
 213         if (sk->state == TCP_CLOSE_WAIT)
 214                 sk->err = EPIPE;
 215 #ifdef CONFIG_TCP_RFC1337
 216         /*
 217          *      Time wait assassination protection [RFC1337]
 218          *
 219          *      This is a good idea, but causes more sockets to take time to close.
 220          *
 221          *      Ian Heavens has since shown this is an inadequate fix for the protocol
 222          *      bug in question.
 223          */
 224         if(sk->state!=TCP_TIME_WAIT)
 225         {       
 226                 tcp_set_state(sk,TCP_CLOSE);
 227                 sk->shutdown = SHUTDOWN_MASK;
 228         }
 229 #else   
 230         tcp_set_state(sk,TCP_CLOSE);
 231         sk->shutdown = SHUTDOWN_MASK;
 232 #endif  
 233         if (!sk->dead) 
 234                 sk->state_change(sk);
 235         kfree_skb(skb, FREE_READ);
 236         return(0);
 237 }
 238 
 239 
 240 /*
 241  *      Look for tcp options. Parses everything but only knows about MSS.
 242  *      This routine is always called with the packet containing the SYN.
 243  *      However it may also be called with the ack to the SYN.  So you
 244  *      can't assume this is always the SYN.  It's always called after
 245  *      we have set up sk->mtu to our own MTU.
 246  *
 247  *      We need at minimum to add PAWS support here. Possibly large windows
 248  *      as Linux gets deployed on 100Mb/sec networks.
 249  */
 250  
 251 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /*  */
 252 {
 253         unsigned char *ptr;
 254         int length=(th->doff*4)-sizeof(struct tcphdr);
 255         int mss_seen = 0;
 256     
 257         ptr = (unsigned char *)(th + 1);
 258   
 259         while(length>0)
 260         {
 261                 int opcode=*ptr++;
 262                 int opsize=*ptr++;
 263                 switch(opcode)
 264                 {
 265                         case TCPOPT_EOL:
 266                                 return;
 267                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
 268                                 length--;
 269                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
 270                                 continue;
 271                         
 272                         default:
 273                                 if(opsize<=2)   /* Avoid silly options looping forever */
 274                                         return;
 275                                 switch(opcode)
 276                                 {
 277                                         case TCPOPT_MSS:
 278                                                 if(opsize==4 && th->syn)
 279                                                 {
 280                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
 281                                                         mss_seen = 1;
 282                                                 }
 283                                                 break;
 284                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
 285                                 }
 286                                 ptr+=opsize-2;
 287                                 length-=opsize;
 288                 }
 289         }
 290         if (th->syn) 
 291         {
 292                 if (! mss_seen)
 293                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
 294         }
 295 #ifdef CONFIG_INET_PCTCP
 296         sk->mss = min(sk->max_window >> 1, sk->mtu);
 297 #else    
 298         sk->mss = min(sk->max_window, sk->mtu);
 299         sk->max_unacked = 2 * sk->mss;
 300 #endif  
 301 }
 302 
 303 
 304 /*
 305  *      This routine handles a connection request.
 306  *      It should make sure we haven't already responded.
 307  *      Because of the way BSD works, we have to send a syn/ack now.
 308  *      This also means it will be harder to close a socket which is
 309  *      listening.
 310  */
 311  
 312 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
     /*  */
 313                  u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
 314 {
 315         struct sock *newsk;
 316         struct tcphdr *th;
 317         struct rtable *rt;
 318   
 319         th = skb->h.th;
 320 
 321         /* If the socket is dead, don't accept the connection. */
 322         if (!sk->dead) 
 323         {
 324                 sk->data_ready(sk,0);
 325         }
 326         else 
 327         {
 328                 if(sk->debug)
 329                         printk("Reset on %p: Connect on dead socket.\n",sk);
 330                 tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
 331                 tcp_statistics.TcpAttemptFails++;
 332                 kfree_skb(skb, FREE_READ);
 333                 return;
 334         }
 335 
 336         /*
 337          *      Make sure we can accept more.  This will prevent a
 338          *      flurry of syns from eating up all our memory.
 339          *
 340          *      BSD does some funnies here and allows 3/2 times the
 341          *      set backlog as a fudge factor. Thats just too gross.
 342          */
 343 
 344         if (sk->ack_backlog >= sk->max_ack_backlog) 
 345         {
 346                 tcp_statistics.TcpAttemptFails++;
 347                 kfree_skb(skb, FREE_READ);
 348                 return;
 349         }
 350 
 351         /*
 352          * We need to build a new sock struct.
 353          * It is sort of bad to have a socket without an inode attached
 354          * to it, but the wake_up's will just wake up the listening socket,
 355          * and if the listening socket is destroyed before this is taken
 356          * off of the queue, this will take care of it.
 357          */
 358 
 359         newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
 360         if (newsk == NULL) 
 361         {
 362                 /* just ignore the syn.  It will get retransmitted. */
 363                 tcp_statistics.TcpAttemptFails++;
 364                 kfree_skb(skb, FREE_READ);
 365                 return;
 366         }
 367 
 368         memcpy(newsk, sk, sizeof(*newsk));
 369         newsk->opt = NULL;
 370         newsk->ip_route_cache  = NULL;
 371         if (opt && opt->optlen) 
 372         {
 373                 sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
 374                 if (!sk->opt) 
 375                 {
 376                         kfree_s(newsk, sizeof(struct sock));
 377                         tcp_statistics.TcpAttemptFails++;
 378                         kfree_skb(skb, FREE_READ);
 379                         return;
 380                 }
 381                 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) 
 382                 {
 383                         kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
 384                         kfree_s(newsk, sizeof(struct sock));
 385                         tcp_statistics.TcpAttemptFails++;
 386                         kfree_skb(skb, FREE_READ);
 387                         return;
 388                 }
 389         }
 390         skb_queue_head_init(&newsk->write_queue);
 391         skb_queue_head_init(&newsk->receive_queue);
 392         newsk->send_head = NULL;
 393         newsk->send_tail = NULL;
 394         skb_queue_head_init(&newsk->back_log);
 395         newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
 396         newsk->rto = TCP_TIMEOUT_INIT;
 397         newsk->mdev = 0;
 398         newsk->max_window = 0;
 399         newsk->cong_window = 1;
 400         newsk->cong_count = 0;
 401         newsk->ssthresh = 0;
 402         newsk->backoff = 0;
 403         newsk->blog = 0;
 404         newsk->intr = 0;
 405         newsk->proc = 0;
 406         newsk->done = 0;
 407         newsk->partial = NULL;
 408         newsk->pair = NULL;
 409         newsk->wmem_alloc = 0;
 410         newsk->rmem_alloc = 0;
 411         newsk->localroute = sk->localroute;
 412 
 413         newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
 414 
 415         newsk->err = 0;
 416         newsk->shutdown = 0;
 417         newsk->ack_backlog = 0;
 418         newsk->acked_seq = skb->seq+1;
 419         newsk->lastwin_seq = skb->seq+1;
 420         newsk->delay_acks = 1;
 421         newsk->copied_seq = skb->seq+1;
 422         newsk->fin_seq = skb->seq;
 423         newsk->state = TCP_SYN_RECV;
 424         newsk->timeout = 0;
 425         newsk->ip_xmit_timeout = 0;
 426         newsk->write_seq = seq; 
 427         newsk->window_seq = newsk->write_seq;
 428         newsk->rcv_ack_seq = newsk->write_seq;
 429         newsk->urg_data = 0;
 430         newsk->retransmits = 0;
 431         newsk->linger=0;
 432         newsk->destroy = 0;
 433         init_timer(&newsk->timer);
 434         newsk->timer.data = (unsigned long)newsk;
 435         newsk->timer.function = &net_timer;
 436         init_timer(&newsk->delack_timer);
 437         newsk->delack_timer.data = (unsigned long)newsk;
 438         newsk->delack_timer.function = tcp_delack_timer;
 439         init_timer(&newsk->retransmit_timer);
 440         newsk->retransmit_timer.data = (unsigned long)newsk;
 441         newsk->retransmit_timer.function = tcp_retransmit_timer;
 442         newsk->dummy_th.source = skb->h.th->dest;
 443         newsk->dummy_th.dest = skb->h.th->source;
 444         
 445         /*
 446          *      Swap these two, they are from our point of view. 
 447          */
 448          
 449         newsk->daddr = saddr;
 450         newsk->saddr = daddr;
 451         newsk->rcv_saddr = daddr;
 452 
 453         put_sock(newsk->num,newsk);
 454         newsk->acked_seq = skb->seq + 1;
 455         newsk->copied_seq = skb->seq + 1;
 456         newsk->socket = NULL;
 457 
 458         /*
 459          *      Grab the ttl and tos values and use them 
 460          */
 461 
 462         newsk->ip_ttl=sk->ip_ttl;
 463         newsk->ip_tos=skb->ip_hdr->tos;
 464 
 465         /*
 466          *      Use 512 or whatever user asked for 
 467          */
 468 
 469         /*
 470          *      Note use of sk->user_mss, since user has no direct access to newsk 
 471          */
 472 
 473         rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
 474         newsk->ip_route_cache = rt;
 475         
 476         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
 477                 newsk->window_clamp = rt->rt_window;
 478         else
 479                 newsk->window_clamp = 0;
 480                 
 481         if (sk->user_mss)
 482                 newsk->mtu = sk->user_mss;
 483         else if (rt)
 484                 newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
 485         else 
 486                 newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
 487 
 488         /*
 489          *      But not bigger than device MTU 
 490          */
 491 
 492         newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
 493 
 494 #ifdef CONFIG_SKIP
 495         
 496         /*
 497          *      SKIP devices set their MTU to 65535. This is so they can take packets
 498          *      unfragmented to security process then fragment. They could lie to the
 499          *      TCP layer about a suitable MTU, but its easier to let skip sort it out
 500          *      simply because the final package we want unfragmented is going to be
 501          *
 502          *      [IPHDR][IPSP][Security data][Modified TCP data][Security data]
 503          */
 504          
 505         if(skip_pick_mtu!=NULL)         /* If SKIP is loaded.. */
 506                 sk->mtu=skip_pick_mtu(sk->mtu,dev);
 507 #endif
 508         /*
 509          *      This will min with what arrived in the packet 
 510          */
 511 
 512         tcp_options(newsk,skb->h.th);
 513         
 514         tcp_cache_zap();
 515         tcp_send_synack(newsk, sk, skb);
 516 }
 517 
 518 
 519 /*
 520  * Handle a TCP window that shrunk on us. It shouldn't happen,
 521  * but..
 522  *
 523  * We may need to move packets from the send queue
 524  * to the write queue, if the window has been shrunk on us.
 525  * The RFC says you are not allowed to shrink your window
 526  * like this, but if the other end does, you must be able
 527  * to deal with it.
 528  */
 529 void tcp_window_shrunk(struct sock * sk, u32 window_seq)
     /*  */
 530 {
 531         struct sk_buff *skb;
 532         struct sk_buff *skb2;
 533         struct sk_buff *wskb = NULL;
 534         
 535         skb2 = sk->send_head;
 536         sk->send_head = NULL;
 537         sk->send_tail = NULL;
 538 
 539         /*
 540          *      This is an artifact of a flawed concept. We want one
 541          *      queue and a smarter send routine when we send all.
 542          */
 543         cli();
 544         while (skb2 != NULL) 
 545         {
 546                 skb = skb2;
 547                 skb2 = skb->link3;
 548                 skb->link3 = NULL;
 549                 if (after(skb->end_seq, window_seq)) 
 550                 {
 551                         if (sk->packets_out > 0) 
 552                                 sk->packets_out--;
 553                         /* We may need to remove this from the dev send list. */
 554                         if (skb->next != NULL) 
 555                         {
 556                                 skb_unlink(skb);                                
 557                         }
 558                         /* Now add it to the write_queue. */
 559                         if (wskb == NULL)
 560                                 skb_queue_head(&sk->write_queue,skb);
 561                         else
 562                                 skb_append(wskb,skb);
 563                         wskb = skb;
 564                 } 
 565                 else 
 566                 {
 567                         if (sk->send_head == NULL) 
 568                         {
 569                                 sk->send_head = skb;
 570                                 sk->send_tail = skb;
 571                         }
 572                         else
 573                         {
 574                                 sk->send_tail->link3 = skb;
 575                                 sk->send_tail = skb;
 576                         }
 577                         skb->link3 = NULL;
 578                 }
 579         }
 580         sti();
 581 }
 582 
 583 
 584 /*
 585  *      This routine deals with incoming acks, but not outgoing ones.
 586  *
 587  *      This routine is totally _WRONG_. The list structuring is wrong,
 588  *      the algorithm is wrong, the code is wrong.
 589  */
 590 
 591 static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
     /*  */
 592 {
 593         int flag = 0;
 594         u32 window_seq;
 595 
 596         /* 
 597          * 1 - there was data in packet as well as ack or new data is sent or 
 598          *     in shutdown state
 599          * 2 - data from retransmit queue was acked and removed
 600          * 4 - window shrunk or data from retransmit queue was acked and removed
 601          * 8 - we want to do a fast retransmit. One packet only.
 602          */
 603 
 604         if(sk->zapped)
 605                 return(1);      /* Dead, cant ack any more so why bother */
 606 
 607         /*
 608          *      We have dropped back to keepalive timeouts. Thus we have
 609          *      no retransmits pending.
 610          */
 611          
 612         if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
 613                 sk->retransmits = 0;
 614 
 615         /*
 616          *      If the ack is newer than sent or older than previous acks
 617          *      then we can probably ignore it.
 618          */
 619          
 620         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
 621                 goto uninteresting_ack;
 622 
 623         /*
 624          *      If there is data set flag 1
 625          */
 626          
 627         if (len != th->doff*4) 
 628                 flag |= 1;
 629 
 630         /*
 631          *      Have we discovered a larger window
 632          */
 633         window_seq = ntohs(th->window);
 634         if (window_seq > sk->max_window) 
 635         {
 636                 sk->max_window = window_seq;
 637 #ifdef CONFIG_INET_PCTCP
 638                 /* Hack because we don't send partial packets to non SWS
 639                    handling hosts */
 640                 sk->mss = min(window_seq>>1, sk->mtu);
 641 #else
 642                 sk->mss = min(window_seq, sk->mtu);
 643 #endif  
 644         }
 645         window_seq += ack;
 646 
 647         /*
 648          *      See if our window has been shrunk. 
 649          */
 650         if (after(sk->window_seq, window_seq)) {
 651                 flag |= 4;
 652                 tcp_window_shrunk(sk, window_seq);
 653         }
 654 
 655         /*
 656          *      Pipe has emptied
 657          */      
 658         if (sk->send_tail == NULL || sk->send_head == NULL) 
 659         {
 660                 sk->send_head = NULL;
 661                 sk->send_tail = NULL;
 662                 sk->packets_out= 0;
 663         }
 664 
 665         /*
 666          *      We don't want too many packets out there. 
 667          */
 668          
 669         if (sk->ip_xmit_timeout == TIME_WRITE && 
 670                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
 671         {
 672                 
 673                 /* 
 674                  * This is Jacobson's slow start and congestion avoidance. 
 675                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
 676                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
 677                  * counter and increment it once every cwnd times.  It's possible
 678                  * that this should be done only if sk->retransmits == 0.  I'm
 679                  * interpreting "new data is acked" as including data that has
 680                  * been retransmitted but is just now being acked.
 681                  */
 682                 if (sk->cong_window < sk->ssthresh)  
 683                         /* 
 684                          *      In "safe" area, increase
 685                          */
 686                         sk->cong_window++;
 687                 else 
 688                 {
 689                         /*
 690                          *      In dangerous area, increase slowly.  In theory this is
 691                          *      sk->cong_window += 1 / sk->cong_window
 692                          */
 693                         if (sk->cong_count >= sk->cong_window) 
 694                         {
 695                                 sk->cong_window++;
 696                                 sk->cong_count = 0;
 697                         }
 698                         else 
 699                                 sk->cong_count++;
 700                 }
 701         }
 702 
 703         /*
 704          *      Remember the highest ack received and update the
 705          *      right hand window edge of the host.
 706          *      We do a bit of work here to track number of times we've
 707          *      seen this ack without a change in the right edge of the
 708          *      window and no data in the packet.
 709          *      This will allow us to do fast retransmits.
 710          */
 711 
 712         if (sk->rcv_ack_seq == ack && sk->window_seq == window_seq && !(flag&1))
 713         {
 714                 /*
 715                  * We only want to short cut this once, many
 716                  * ACKs may still come, we'll do a normal transmit
 717                  * for these ACKs.
 718                  */
 719                 if (++sk->rcv_ack_cnt == MAX_DUP_ACKS+1)
 720                         flag |= 8;      /* flag for a fast retransmit */
 721         }
 722         else
 723         {
 724                 sk->window_seq = window_seq;
 725                 sk->rcv_ack_seq = ack;
 726                 sk->rcv_ack_cnt = 1;
 727         }
 728         
 729         /*
 730          *      We passed data and got it acked, remove any soft error
 731          *      log. Something worked...
 732          */
 733          
 734         sk->err_soft = 0;
 735 
 736         /*
 737          *      If this ack opens up a zero window, clear backoff.  It was
 738          *      being used to time the probes, and is probably far higher than
 739          *      it needs to be for normal retransmission.
 740          */
 741 
 742         if (sk->ip_xmit_timeout == TIME_PROBE0) 
 743         {
 744                 sk->retransmits = 0;    /* Our probe was answered */
 745                 
 746                 /*
 747                  *      Was it a usable window open ?
 748                  */
 749                  
 750                 if (!skb_queue_empty(&sk->write_queue) &&   /* should always be true */
 751                     ! before (sk->window_seq, sk->write_queue.next->end_seq)) 
 752                 {
 753                         sk->backoff = 0;
 754                         
 755                         /*
 756                          *      Recompute rto from rtt.  this eliminates any backoff.
 757                          */
 758 
 759                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
 760                         if (sk->rto > 120*HZ)
 761                                 sk->rto = 120*HZ;
 762                         if (sk->rto < HZ/5)     /* Was 1*HZ, then 1 - turns out we must allow about
 763                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
 764                                                    .2 of a second is going to need huge windows (SIGH) */
 765                         sk->rto = HZ/5;
 766                 }
 767         }
 768 
 769         /* 
 770          *      See if we can take anything off of the retransmit queue.
 771          */
 772 
 773         for (;;) {
 774                 struct sk_buff * skb = sk->send_head;
 775                 if (!skb)
 776                         break;
 777 
 778                 /* Check for a bug. */
 779                 if (skb->link3 && after(skb->end_seq, skb->link3->end_seq)) 
 780                         printk("INET: tcp.c: *** bug send_list out of order.\n");
 781                         
 782                 /*
 783                  *      If our packet is before the ack sequence we can
 784                  *      discard it as it's confirmed to have arrived the other end.
 785                  */
 786                  
 787                 if (after(skb->end_seq, ack))
 788                         break;
 789 
 790                 if (sk->retransmits) 
 791                 {       
 792                         /*
 793                          *      We were retransmitting.  don't count this in RTT est 
 794                          */
 795                         flag |= 2;
 796                 }
 797 
 798                 if ((sk->send_head = skb->link3) == NULL)
 799                 {
 800                         sk->send_tail = NULL;
 801                         sk->retransmits = 0;
 802                 }
 803                 /*
 804                  * Note that we only reset backoff and rto in the
 805                  * rtt recomputation code.  And that doesn't happen
 806                  * if there were retransmissions in effect.  So the
 807                  * first new packet after the retransmissions is
 808                  * sent with the backoff still in effect.  Not until
 809                  * we get an ack from a non-retransmitted packet do
 810                  * we reset the backoff and rto.  This allows us to deal
 811                  * with a situation where the network delay has increased
 812                  * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
 813                  */
 814 
 815                 /*
 816                  *      We have one less packet out there. 
 817                  */
 818                          
 819                 if (sk->packets_out > 0) 
 820                         sk->packets_out --;
 821 
 822                 if (!(flag&2))  /* Not retransmitting */
 823                         tcp_rtt_estimator(sk,skb);
 824                 flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
 825                                    In this case as we just set it up */
 826                 IS_SKB(skb);
 827 
 828                 /*
 829                  *      We may need to remove this from the dev send list. 
 830                  */
 831                 cli();
 832                 if (skb->next)
 833                         skb_unlink(skb);
 834                 sti();
 835                 kfree_skb(skb, FREE_WRITE); /* write. */
 836                 if (!sk->dead)
 837                         sk->write_space(sk);
 838         }
 839 
 840         /*
 841          * XXX someone ought to look at this too.. at the moment, if skb_peek()
 842          * returns non-NULL, we completely ignore the timer stuff in the else
 843          * clause.  We ought to organize the code so that else clause can
 844          * (should) be executed regardless, possibly moving the PROBE timer
 845          * reset over.  The skb_peek() thing should only move stuff to the
 846          * write queue, NOT also manage the timer functions.
 847          */
 848 
 849         /*
 850          * Maybe we can take some stuff off of the write queue,
 851          * and put it onto the xmit queue.
 852          */
 853         if (skb_peek(&sk->write_queue) != NULL) 
 854         {
 855                 if (!before(sk->window_seq, sk->write_queue.next->end_seq) &&
 856                         (sk->retransmits == 0 || 
 857                          sk->ip_xmit_timeout != TIME_WRITE ||
 858                          !after(sk->write_queue.next->end_seq, sk->rcv_ack_seq))
 859                         && sk->packets_out < sk->cong_window) 
 860                 {
 861                         /*
 862                          *      Add more data to the send queue.
 863                          */
 864                         flag |= 1;
 865                         tcp_write_xmit(sk);
 866                 }
 867                 else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
 868                         sk->send_head == NULL &&
 869                         sk->ack_backlog == 0 &&
 870                         sk->state != TCP_TIME_WAIT) 
 871                 {
 872                         /*
 873                          *      Data to queue but no room.
 874                          */
 875                         tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
 876                 }               
 877         }
 878         else
 879         {
 880                 /*
 881                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
 882                  * from TCP_CLOSE we don't do anything
 883                  *
 884                  * from anything else, if there is write data (or fin) pending,
 885                  * we use a TIME_WRITE timeout, else if keepalive we reset to
 886                  * a KEEPALIVE timeout, else we delete the timer.
 887                  *
 888                  * We do not set flag for nominal write data, otherwise we may
 889                  * force a state where we start to write itsy bitsy tidbits
 890                  * of data.
 891                  */
 892 
 893                 switch(sk->state) {
 894                 case TCP_TIME_WAIT:
 895                         /*
 896                          * keep us in TIME_WAIT until we stop getting packets,
 897                          * reset the timeout.
 898                          */
 899                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 900                         break;
 901                 case TCP_CLOSE:
 902                         /*
 903                          * don't touch the timer.
 904                          */
 905                         break;
 906                 default:
 907                         /*
 908                          *      Must check send_head and write_queue
 909                          *      to determine which timeout to use.
 910                          */
 911                         if (sk->send_head || !skb_queue_empty(&sk->write_queue)) {
 912                                 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
 913                         } else if (sk->keepopen) {
 914                                 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
 915                         } else {
 916                                 del_timer(&sk->retransmit_timer);
 917                                 sk->ip_xmit_timeout = 0;
 918                         }
 919                         break;
 920                 }
 921         }
 922 
 923         /*
 924          *      We have nothing queued but space to send. Send any partial
 925          *      packets immediately (end of Nagle rule application).
 926          */
 927          
 928         if (sk->packets_out == 0
 929             && sk->partial != NULL
 930             && skb_queue_empty(&sk->write_queue)
 931             && sk->send_head == NULL) 
 932         {
 933                 flag |= 1;
 934                 tcp_send_partial(sk);
 935         }
 936 
 937         /*
 938          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
 939          * we are now waiting for an acknowledge to our FIN.  The other end is
 940          * already in TIME_WAIT.
 941          *
 942          * Move to TCP_CLOSE on success.
 943          */
 944 
 945         if (sk->state == TCP_LAST_ACK) 
 946         {
 947                 if (!sk->dead)
 948                         sk->state_change(sk);
 949                 if(sk->debug)
 950                         printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
 951                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
 952                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
 953                 {
 954                         flag |= 1;
 955                         sk->shutdown = SHUTDOWN_MASK;
 956                         tcp_set_state(sk,TCP_CLOSE);
 957                         return 1;
 958                 }
 959         }
 960 
 961         /*
 962          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
 963          *
 964          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
 965          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
 966          */
 967 
 968         if (sk->state == TCP_FIN_WAIT1) 
 969         {
 970 
 971                 if (!sk->dead) 
 972                         sk->state_change(sk);
 973                 if (sk->rcv_ack_seq == sk->write_seq) 
 974                 {
 975                         flag |= 1;
 976                         sk->shutdown |= SEND_SHUTDOWN;
 977                         tcp_set_state(sk, TCP_FIN_WAIT2);
 978                 }
 979         }
 980 
 981         /*
 982          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
 983          *
 984          *      Move to TIME_WAIT
 985          */
 986 
 987         if (sk->state == TCP_CLOSING) 
 988         {
 989 
 990                 if (!sk->dead) 
 991                         sk->state_change(sk);
 992                 if (sk->rcv_ack_seq == sk->write_seq) 
 993                 {
 994                         flag |= 1;
 995                         tcp_time_wait(sk);
 996                 }
 997         }
 998         
 999         /*
1000          *      Final ack of a three way shake 
1001          */
1002          
1003         if(sk->state==TCP_SYN_RECV)
1004         {
1005                 tcp_set_state(sk, TCP_ESTABLISHED);
1006                 tcp_options(sk,th);
1007                 sk->dummy_th.dest=th->source;
1008                 sk->copied_seq = sk->acked_seq;
1009                 if(!sk->dead)
1010                         sk->state_change(sk);
1011                 if(sk->max_window==0)
1012                 {
1013                         sk->max_window=32;      /* Sanity check */
1014                         sk->mss=min(sk->max_window,sk->mtu);
1015                 }
1016         }
1017         
1018         /*
1019          * I make no guarantees about the first clause in the following
1020          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
1021          * what conditions "!flag" would be true.  However I think the rest
1022          * of the conditions would prevent that from causing any
1023          * unnecessary retransmission. 
1024          *   Clearly if the first packet has expired it should be 
1025          * retransmitted.  The other alternative, "flag&2 && retransmits", is
1026          * harder to explain:  You have to look carefully at how and when the
1027          * timer is set and with what timeout.  The most recent transmission always
1028          * sets the timer.  So in general if the most recent thing has timed
1029          * out, everything before it has as well.  So we want to go ahead and
1030          * retransmit some more.  If we didn't explicitly test for this
1031          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
1032          * would not be true.  If you look at the pattern of timing, you can
1033          * show that rto is increased fast enough that the next packet would
1034          * almost never be retransmitted immediately.  Then you'd end up
1035          * waiting for a timeout to send each packet on the retransmission
1036          * queue.  With my implementation of the Karn sampling algorithm,
1037          * the timeout would double each time.  The net result is that it would
1038          * take a hideous amount of time to recover from a single dropped packet.
1039          * It's possible that there should also be a test for TIME_WRITE, but
1040          * I think as long as "send_head != NULL" and "retransmit" is on, we've
1041          * got to be in real retransmission mode.
1042          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
1043          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
1044          * As long as no further losses occur, this seems reasonable.
1045          */
1046         
1047         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
1048                (((flag&2) && sk->retransmits) ||
1049                 (flag&8) ||
1050                (sk->send_head->when + sk->rto < jiffies))) 
1051         {
1052                 if(sk->send_head->when + sk->rto < jiffies)
1053                         tcp_retransmit(sk,0);   
1054                 else
1055                 {
1056                         tcp_do_retransmit(sk, 1);
1057                         tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1058                 }
1059         }
1060 
1061         return 1;
1062 
1063 uninteresting_ack:
1064         if(sk->debug)
1065                 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
1066                         
1067         /*
1068          *      Keepalive processing.
1069          */
1070                  
1071         if (after(ack, sk->sent_seq)) 
1072         {
1073                 return 0;
1074         }
1075                 
1076         /*
1077          *      Restart the keepalive timer.
1078          */
1079                  
1080         if (sk->keepopen) 
1081         {
1082                 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
1083                         tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1084         }
1085         return 1;
1086 }
1087 
1088 
1089 /*
1090  *      Process the FIN bit. This now behaves as it is supposed to work
1091  *      and the FIN takes effect when it is validly part of sequence
1092  *      space. Not before when we get holes.
1093  *
1094  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1095  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
1096  *      TIME-WAIT)
1097  *
1098  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
1099  *      close and we go into CLOSING (and later onto TIME-WAIT)
1100  *
1101  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1102  *
1103  */
1104  
1105 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     /*  */
1106 {
1107         sk->fin_seq = skb->end_seq;
1108 
1109         if (!sk->dead) 
1110         {
1111                 sk->state_change(sk);
1112                 sock_wake_async(sk->socket, 1);
1113         }
1114 
1115         switch(sk->state) 
1116         {
1117                 case TCP_SYN_RECV:
1118                 case TCP_SYN_SENT:
1119                 case TCP_ESTABLISHED:
1120                         /*
1121                          * move to CLOSE_WAIT, tcp_data() already handled
1122                          * sending the ack.
1123                          */
1124                         tcp_set_state(sk,TCP_CLOSE_WAIT);
1125                         if (th->rst)
1126                                 sk->shutdown = SHUTDOWN_MASK;
1127                         break;
1128 
1129                 case TCP_CLOSE_WAIT:
1130                 case TCP_CLOSING:
1131                         /*
1132                          * received a retransmission of the FIN, do
1133                          * nothing.
1134                          */
1135                         break;
1136                 case TCP_TIME_WAIT:
1137                         /*
1138                          * received a retransmission of the FIN,
1139                          * restart the TIME_WAIT timer.
1140                          */
1141                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1142                         return(0);
1143                 case TCP_FIN_WAIT1:
1144                         /*
1145                          * This case occurs when a simultaneous close
1146                          * happens, we must ack the received FIN and
1147                          * enter the CLOSING state.
1148                          *
1149                          * This causes a WRITE timeout, which will either
1150                          * move on to TIME_WAIT when we timeout, or resend
1151                          * the FIN properly (maybe we get rid of that annoying
1152                          * FIN lost hang). The TIME_WRITE code is already correct
1153                          * for handling this timeout.
1154                          */
1155 
1156                         if(sk->ip_xmit_timeout != TIME_WRITE)
1157                                 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1158                         tcp_set_state(sk,TCP_CLOSING);
1159                         break;
1160                 case TCP_FIN_WAIT2:
1161                         /*
1162                          * received a FIN -- send ACK and enter TIME_WAIT
1163                          */
1164                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1165                         sk->shutdown|=SHUTDOWN_MASK;
1166                         tcp_set_state(sk,TCP_TIME_WAIT);
1167                         break;
1168                 case TCP_CLOSE:
1169                         /*
1170                          * already in CLOSE
1171                          */
1172                         break;
1173                 default:
1174                         tcp_set_state(sk,TCP_LAST_ACK);
1175         
1176                         /* Start the timers. */
1177                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1178                         return(0);
1179         }
1180 
1181         return(0);
1182 }
1183 
1184 /*
1185  * Add a sk_buff to the TCP receive queue, calculating
1186  * the ACK sequence as we go..
1187  */
1188 static inline void tcp_insert_skb(struct sk_buff * skb, struct sk_buff_head * list)
     /*  */
1189 {
1190         struct sk_buff * prev, * next;
1191         u32 seq;
1192 
1193         /*
1194          * Find where the new skb goes.. (This goes backwards,
1195          * on the assumption that we get the packets in order)
1196          */
1197         seq = skb->seq;
1198         prev = list->prev;
1199         next = (struct sk_buff *) list;
1200         for (;;) {
1201                 if (prev == (struct sk_buff *) list || !after(prev->seq, seq))
1202                         break;
1203                 next = prev;
1204                 prev = prev->prev;
1205         }
1206         __skb_insert(skb, prev, next, list);
1207 }
1208 
1209 /*
1210  * Called for each packet when we find a new ACK endpoint sequence in it
1211  */
1212 static inline u32 tcp_queue_ack(struct sk_buff * skb, struct sock * sk)
     /*  */
1213 {
1214         /*
1215          *      When we ack the fin, we do the FIN 
1216          *      processing.
1217          */
1218         skb->acked = 1;
1219         if (skb->h.th->fin)
1220                 tcp_fin(skb,sk,skb->h.th);
1221         return skb->end_seq;
1222 }       
1223 
1224 static void tcp_queue(struct sk_buff * skb, struct sock * sk, struct tcphdr *th)
     /*  */
1225 {
1226         u32 ack_seq;
1227 
1228         tcp_insert_skb(skb, &sk->receive_queue);
1229 
1230         /*
1231          * Did we get anything new to ack?
1232          */
1233         ack_seq = sk->acked_seq;
1234 
1235 
1236         if (!after(skb->seq, ack_seq)) {
1237                 if (after(skb->end_seq, ack_seq)) {
1238                         /* the packet straddles our window end */
1239                         struct sk_buff_head * list = &sk->receive_queue;
1240                         struct sk_buff * next;
1241                         ack_seq = tcp_queue_ack(skb, sk);
1242 
1243                         /*
1244                          * Do we have any old packets to ack that the above
1245                          * made visible? (Go forward from skb)
1246                          */
1247                         next = skb->next;
1248                         while (next != (struct sk_buff *) list) {
1249                                 if (after(next->seq, ack_seq))
1250                                         break;
1251                                 if (after(next->end_seq, ack_seq))
1252                                         ack_seq = tcp_queue_ack(next, sk);
1253                                 next = next->next;
1254                         }
1255 
1256                         /*
1257                          * Ok, we found new data, update acked_seq as
1258                          * necessary (and possibly send the actual
1259                          * ACK packet).
1260                          */
1261                         sk->acked_seq = ack_seq;
1262 
1263                 } else {
1264                         if (sk->debug)
1265                                 printk("Ack duplicate packet.\n");
1266                         tcp_send_ack(sk);
1267                         return;
1268                 }
1269 
1270 
1271                 /*
1272                  * Delay the ack if possible.  Send ack's to
1273                  * fin frames immediately as there shouldn't be
1274                  * anything more to come.
1275                  */
1276                 if (!sk->delay_acks || th->fin) {
1277                         tcp_send_ack(sk);
1278                 } else {
1279                         /*
1280                          * If psh is set we assume it's an
1281                          * interactive session that wants quick
1282                          * acks to avoid nagling too much. 
1283                          */
1284                         int delay = HZ/2;
1285                         if (th->psh)
1286                                 delay = HZ/50;
1287                         tcp_send_delayed_ack(sk, delay);
1288                 }
1289 
1290                 /*
1291                  *      Tell the user we have some more data.
1292                  */
1293 
1294                 if (!sk->dead)
1295                         sk->data_ready(sk,0);
1296 
1297         }
1298         else
1299         {
1300             /*
1301              *  If we've missed a packet, send an ack.
1302              *  Also start a timer to send another.
1303              *
1304              *  4.3reno machines look for these kind of acks so
1305              *  they can do fast recovery. Three identical 'old'
1306              *  acks lets it know that one frame has been lost
1307              *      and should be resent. Because this is before the
1308              *  whole window of data has timed out it can take
1309              *  one lost frame per window without stalling.
1310              *  [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
1311              *
1312              *  We also should be spotting triple bad sequences.
1313              *  [We now do this.]
1314              *
1315              */
1316              
1317             if (!skb->acked) 
1318             {
1319                     if(sk->debug)
1320                             printk("Ack past end of seq packet.\n");
1321                     tcp_send_ack(sk);
1322                     tcp_send_delayed_ack(sk,HZ/2);
1323             }
1324         }
1325 }
1326 
1327 
1328 /*
1329  *      This routine handles the data.  If there is room in the buffer,
1330  *      it will be have already been moved into it.  If there is no
1331  *      room, then we will just have to discard the packet.
1332  */
1333 
1334 static int tcp_data(struct sk_buff *skb, struct sock *sk, 
     /*  */
1335          unsigned long saddr, unsigned int len)
1336 {
1337         struct tcphdr *th;
1338         u32 new_seq, shut_seq;
1339 
1340         th = skb->h.th;
1341         skb_pull(skb,th->doff*4);
1342         skb_trim(skb,len-(th->doff*4));
1343 
1344         /*
1345          *      The bytes in the receive read/assembly queue has increased. Needed for the
1346          *      low memory discard algorithm 
1347          */
1348            
1349         sk->bytes_rcv += skb->len;
1350         
1351         if (skb->len == 0 && !th->fin) 
1352         {
1353                 /* 
1354                  *      Don't want to keep passing ack's back and forth. 
1355                  *      (someone sent us dataless, boring frame)
1356                  */
1357                 if (!th->ack)
1358                         tcp_send_ack(sk);
1359                 kfree_skb(skb, FREE_READ);
1360                 return(0);
1361         }
1362         
1363         /*
1364          *      We no longer have anyone receiving data on this connection.
1365          */
1366 
1367 #ifndef TCP_DONT_RST_SHUTDOWN            
1368 
1369         if(sk->shutdown & RCV_SHUTDOWN)
1370         {
1371                 /*
1372                  *      FIXME: BSD has some magic to avoid sending resets to
1373                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
1374                  *      BSD stacks still have broken keepalives so we want to
1375                  *      cope with it.
1376                  */
1377 
1378                 if(skb->len)    /* We don't care if it's just an ack or
1379                                    a keepalive/window probe */
1380                 {
1381                         new_seq = skb->seq + skb->len + th->syn;        /* Right edge of _data_ part of frame */
1382                         
1383                         /* Do this the way 4.4BSD treats it. Not what I'd
1384                            regard as the meaning of the spec but it's what BSD
1385                            does and clearly they know everything 8) */
1386 
1387                         /*
1388                          *      This is valid because of two things
1389                          *
1390                          *      a) The way tcp_data behaves at the bottom.
1391                          *      b) A fin takes effect when read not when received.
1392                          */
1393                          
1394                         shut_seq = sk->acked_seq+1;     /* Last byte */
1395                         
1396                         if(after(new_seq,shut_seq))
1397                         {
1398                                 if(sk->debug)
1399                                         printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
1400                                                 sk, new_seq, shut_seq, sk->blog);
1401                                 if(sk->dead)
1402                                 {
1403                                         sk->acked_seq = new_seq + th->fin;
1404                                         tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
1405                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
1406                                         tcp_statistics.TcpEstabResets++;
1407                                         sk->err = EPIPE;
1408                                         sk->error_report(sk);
1409                                         sk->shutdown = SHUTDOWN_MASK;
1410                                         tcp_set_state(sk,TCP_CLOSE);
1411                                         kfree_skb(skb, FREE_READ);
1412                                         return 0;
1413                                 }
1414                         }
1415                 }
1416         }
1417 
1418 #endif
1419 
1420         tcp_queue(skb, sk, th);
1421 
1422         return(0);
1423 }
1424 
1425 
1426 /*
1427  *      This routine is only called when we have urgent data
1428  *      signalled. Its the 'slow' part of tcp_urg. It could be
1429  *      moved inline now as tcp_urg is only called from one
1430  *      place. We handle URGent data wrong. We have to - as
1431  *      BSD still doesn't use the correction from RFC961.
1432  */
1433  
1434 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /*  */
1435 {
1436         u32 ptr = ntohs(th->urg_ptr);
1437 
1438         if (ptr)
1439                 ptr--;
1440         ptr += ntohl(th->seq);
1441 
1442         /* ignore urgent data that we've already seen and read */
1443         if (after(sk->copied_seq, ptr))
1444                 return;
1445 
1446         /* do we already have a newer (or duplicate) urgent pointer? */
1447         if (sk->urg_data && !after(ptr, sk->urg_seq))
1448                 return;
1449 
1450         /* tell the world about our new urgent pointer */
1451         if (sk->proc != 0) {
1452                 if (sk->proc > 0) {
1453                         kill_proc(sk->proc, SIGURG, 1);
1454                 } else {
1455                         kill_pg(-sk->proc, SIGURG, 1);
1456                 }
1457         }
1458         sk->urg_data = URG_NOTYET;
1459         sk->urg_seq = ptr;
1460 }
1461 
1462 /*
1463  *      This is the 'fast' part of urgent handling.
1464  */
1465  
1466 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
     /*  */
1467 {
1468         /*
1469          *      Check if we get a new urgent pointer - normally not 
1470          */
1471          
1472         if (th->urg)
1473                 tcp_check_urg(sk,th);
1474 
1475         /*
1476          *      Do we wait for any urgent data? - normally not
1477          */
1478          
1479         if (sk->urg_data == URG_NOTYET) {
1480                 u32 ptr;
1481 
1482                 /*
1483                  *      Is the urgent pointer pointing into this packet? 
1484                  */      
1485                 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
1486                 if (ptr < len) {
1487                         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
1488                         if (!sk->dead)
1489                                 sk->data_ready(sk,0);
1490                 }
1491         }
1492 }
1493 
1494 /*
1495  * This should be a bit smarter and remove partially
1496  * overlapping stuff too, but this should be good
1497  * enough for any even remotely normal case (and the
1498  * worst that can happen is that we have a few
1499  * unnecessary packets in the receive queue).
1500  *
1501  * This function is never called with an empty list..
1502  */
1503 static inline void tcp_remove_dups(struct sk_buff_head * list)
     /*  */
1504 {
1505         struct sk_buff * next = list->next;
1506 
1507         for (;;) {
1508                 struct sk_buff * skb = next;
1509                 next = next->next;
1510                 if (next == (struct sk_buff *) list)
1511                         break;
1512                 if (before(next->end_seq, skb->end_seq)) {
1513                         __skb_unlink(next, list);
1514                         kfree_skb(next, FREE_READ);
1515                         next = skb;
1516                         continue;
1517                 }
1518                 if (next->seq != skb->seq)
1519                         continue;
1520                 __skb_unlink(skb, list);
1521                 kfree_skb(skb, FREE_READ);
1522         }
1523 }
1524 
1525 /*
1526  * Throw out all unnecessary packets: we've gone over the
1527  * receive queue limit. This shouldn't happen in a normal
1528  * TCP connection, but we might have gotten duplicates etc.
1529  */
1530 static void prune_queue(struct sk_buff_head * list)
     /*  */
1531 {
1532         for (;;) {
1533                 struct sk_buff * skb = list->prev;
1534 
1535                 /* gone through it all? */
1536                 if (skb == (struct sk_buff *) list)
1537                         break;
1538                 if (!skb->acked) {
1539                         __skb_unlink(skb, list);
1540                         kfree_skb(skb, FREE_READ);
1541                         continue;
1542                 }
1543                 tcp_remove_dups(list);
1544                 break;
1545         }
1546 }
1547 
1548 /*
1549  *      A TCP packet has arrived: validate it, find its socket and process it according to sk->state.
1550  *              skb->h.raw is the TCP header. Returns 0, or whatever tcp_reset() returns on the reset paths.
1551  */
1552  
1553 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
     /*  */
1554         __u32 daddr, unsigned short len,
1555         __u32 saddr, int redo, struct inet_protocol * protocol)
1556 {
1557         struct tcphdr *th;
1558         struct sock *sk;
1559         int syn_ok=0;           /* set to 1 when a SYN_SENT socket accepts a SYN+ACK, so the later SYN test does not reset it */
1560 
1561         /*
1562          * "redo" is 1 if we have already seen this skb but couldn't
1563          * use it at that time (the socket was locked).  In that case
1564          * we have already done a lot of the work (looked up the socket
1565          * etc).
1566          */
1567         th = skb->h.th;
1568         sk = skb->sk;
1569         if (!redo) {
1570                 tcp_statistics.TcpInSegs++;
1571                 if (skb->pkt_type!=PACKET_HOST)
1572                         goto discard_it;
1573 
1574                 /*
1575                  *      Pull up the IP header (the bytes between skb->data and skb->h.raw).
1576                  */
1577         
1578                 skb_pull(skb, skb->h.raw-skb->data);
1579 
1580                 /*
1581                  *      Try to use the device checksum if provided.
1582                  */
1583                 switch (skb->ip_summed) 
1584                 {
1585                         case CHECKSUM_NONE:
1586                                 skb->csum = csum_partial((char *)th, len, 0);   /* no break: falls through to the check below */
1587                         case CHECKSUM_HW:
1588                                 if (tcp_check(th, len, saddr, daddr, skb->csum))
1589                                         goto discard_it;
1590                         default:
1591                                 /* CHECKSUM_UNNECESSARY: no check is done here. NOTE(review): a label directly before '}' needs a following statement in ISO C before C23 - confirm the compiler accepts it */
1592                 }
1593                 sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
1594                 if (!sk)
1595                         goto no_tcp_socket;
1596                 skb->sk = sk;
1597                 skb->seq = ntohl(th->seq);
1598                 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;       /* len minus the doff*4 header bytes, plus one each for SYN and FIN */
1599                 skb->ack_seq = ntohl(th->ack_seq);
1600 
1601                 skb->acked = 0;
1602                 skb->used = 0;
1603                 skb->free = 1;
1604                 skb->saddr = daddr;     /* note: the skb stores the two addresses swapped */
1605                 skb->daddr = saddr;
1606 
1607                 /*
1608                  * We may need to add it to the backlog here (sk->users is non-zero).
1609                  */
1610                 if (sk->users) 
1611                 {
1612                         __skb_queue_tail(&sk->back_log, skb);
1613                         return(0);
1614                 }
1615         }
1616 
1617         /*
1618          *      If this socket has got a reset it's to all intents and purposes 
1619          *      really dead. Count closed sockets as dead.
1620          *
1621          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
1622          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
1623          *      exist so should cause resets as if the port was unreachable.
1624          */
1625 
1626         if (sk->zapped || sk->state==TCP_CLOSE)
1627                 goto no_tcp_socket;
1628         /* NOTE(review): the branch below returns without freeing skb - confirm whether that is intended */
1629         if (!sk->prot) 
1630         {
1631                 printk("IMPOSSIBLE 3\n");
1632                 return(0);
1633         }
1634 
1635 
1636         /*
1637          *      Charge the memory to the socket. 
1638          */
1639          
1640         skb->sk=sk;
1641         atomic_add(skb->truesize, &sk->rmem_alloc);
1642         
1643         /*
1644          *      We should now do header prediction (nothing is implemented at this point).
1645          */
1646          
1647         /*
1648          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
1649          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
1650          *      compatibility. We also set up variables more thoroughly [Karn notes in the
1651          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
1652          */
1653 
1654         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
1655         {
1656         
1657                 /*
1658                  *      Now deal with unusual cases.
1659                  */
1660          
1661                 if(sk->state==TCP_LISTEN)
1662                 {
1663                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
1664                                 tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
1665 
1666                         /*
1667                          *      We don't care for RST, and non SYN are absorbed (old segments)
1668                          *      Broadcast/multicast SYN isn't allowed. Note - bug if you change the
1669                          *      netmask on a running connection it can go broadcast. Even Suns have
1670                          *      this problem so I'm ignoring it 
1671                          */
1672                            
1673                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
1674                         {
1675                                 kfree_skb(skb, FREE_READ);
1676                                 return 0;
1677                         }
1678                 
1679                         /*      
1680                          *      Guess we need to make a new socket up 
1681                          */
1682                 
1683                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
1684                         /* skb is not freed on this path; presumably tcp_conn_request() takes it over - verify */
1685                         /*
1686                          *      Now we have several options: In theory there is nothing else
1687                          *      in the frame. KA9Q has an option to send data with the syn,
1688                          *      BSD accepts data with the syn up to the [to be] advertised window
1689                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
1690                          *      it, that fits the spec precisely and avoids incompatibilities. It
1691                          *      would be nice in future to drop through and process the data.
1692                          *
1693                          *      Now that TTCP is starting to be used we ought to queue this data.
1694                          */
1695                          
1696                         return 0;
1697                 }
1698         
1699                 /* 
1700                  *      Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
1701                  *      then it's a new connection
1702                  */
1703                  
1704                 if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
1705                 {
1706                         kfree_skb(skb, FREE_READ);
1707                         return 0;
1708                 }
1709                 
1710                 /*
1711                  *      SYN sent means we have to look for a suitable ack and either reset
1712                  *      for bad matches or go to connected. The SYN_SENT case is unusual and should
1713                  *      not be in line code. [AC]
1714                  */
1715            
1716                 if(sk->state==TCP_SYN_SENT)
1717                 {
1718                         /* Crossed SYN or previous junk segment */
1719                         if(th->ack)
1720                         {
1721                                 /* We got an ack, but it's not a good ack */
1722                                 if(!tcp_ack(sk,th,skb->ack_seq,len))
1723                                 {
1724                                         /* Reset the ack - it's an ack from a 
1725                                            different connection  [ th->rst is checked in tcp_send_reset()] */
1726                                         tcp_statistics.TcpAttemptFails++;
1727                                         tcp_send_reset(daddr, saddr, th,
1728                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1729                                         kfree_skb(skb, FREE_READ);
1730                                         return(0);
1731                                 }
1732                                 if(th->rst)
1733                                         return tcp_reset(sk,skb);
1734                                 if(!th->syn)
1735                                 {
1736                                         /* A valid ack from a different connection
1737                                            start. Shouldn't happen but cover it */
1738                                         tcp_statistics.TcpAttemptFails++;
1739                                         tcp_send_reset(daddr, saddr, th,
1740                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1741                                         kfree_skb(skb, FREE_READ);
1742                                         return 0;
1743                                 }
1744                                 /*
1745                                  *      Ok.. it's good. Set up sequence numbers and
1746                                  *      move to established.
1747                                  */
1748                                 syn_ok=1;       /* Don't reset this connection for the syn */
1749                                 sk->acked_seq = skb->seq+1;
1750                                 sk->lastwin_seq = skb->seq+1;
1751                                 sk->fin_seq = skb->seq;
1752                                 tcp_send_ack(sk);
1753                                 tcp_set_state(sk, TCP_ESTABLISHED);
1754                                 tcp_options(sk,th);
1755                                 sk->dummy_th.dest=th->source;
1756                                 sk->copied_seq = sk->acked_seq;
1757                                 if(!sk->dead)
1758                                 {
1759                                         sk->state_change(sk);
1760                                         sock_wake_async(sk->socket, 0);
1761                                 }
1762                                 if(sk->max_window==0)   /* max_window still 0: use 32 and clamp mss to it */
1763                                 {
1764                                         sk->max_window = 32;
1765                                         sk->mss = min(sk->max_window, sk->mtu);
1766                                 }
1767                         }
1768                         else
1769                         {
1770                                 /* See if SYN's cross. Drop if boring */
1771                                 if(th->syn && !th->rst)
1772                                 {
1773                                         /* Crossed SYN's are fine - but talking to
1774                                            yourself is right out... */
1775                                         if(sk->saddr==saddr && sk->daddr==daddr &&
1776                                                 sk->dummy_th.source==th->source &&
1777                                                 sk->dummy_th.dest==th->dest)
1778                                         {
1779                                                 tcp_statistics.TcpAttemptFails++;
1780                                                 return tcp_reset(sk,skb);
1781                                         }
1782                                         tcp_set_state(sk,TCP_SYN_RECV);
1783                                         
1784                                         /*
1785                                          *      FIXME:
1786                                          *      Must send SYN|ACK here
1787                                          */
1788                                 }               
1789                                 /* Discard junk segment */
1790                                 kfree_skb(skb, FREE_READ);
1791                                 return 0;
1792                         }
1793                         /*
1794                          *      SYN_RECV with data maybe.. drop through
1795                          */
1796                         goto rfc_step6;         /* skips the sequence, RST, SYN and ACK checks below */
1797                 }
1798 
1799         /*
1800          *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
1801          *      a more complex suggestion for fixing these reuse issues in RFC1644
1802          *      but not yet ready for general use. Also see RFC1379.
1803          *
1804          *      NOTE(review): this comment used to describe going back to the top
1805          *      of this function ("goto try_next_socket"), but no such label exists
1806          *      here. The code below looks the new socket up with get_sock() and calls
1807          *      tcp_conn_request() directly, without checking "sk->users" for it.
1808          */
1809         
1810 #define BSD_TIME_WAIT
1811 #ifdef BSD_TIME_WAIT
1812                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
1813                         after(skb->seq, sk->acked_seq) && !th->rst)
1814                 {
1815                         u32 seq = sk->write_seq;
1816                         if(sk->debug)
1817                                 printk("Doing a BSD time wait\n");
1818                         tcp_statistics.TcpEstabResets++;           
1819                         atomic_sub(skb->truesize, &sk->rmem_alloc);     /* take back the charge made above before leaving the old socket */
1820                         skb->sk = NULL;
1821                         sk->err=ECONNRESET;
1822                         tcp_set_state(sk, TCP_CLOSE);
1823                         sk->shutdown = SHUTDOWN_MASK;
1824                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
1825                         /* this is not really correct: we should check sk->users */
1826                         if (sk && sk->state==TCP_LISTEN)
1827                         {
1828                                 skb->sk = sk;
1829                                 atomic_add(skb->truesize, &sk->rmem_alloc);
1830                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);    /* last argument is the old socket's write_seq plus 128000 */
1831                                 return 0;
1832                         }
1833                         kfree_skb(skb, FREE_READ);
1834                         return 0;
1835                 }
1836 #endif  
1837         }
1838 
1839         /*
1840          *      We are now in normal data flow (see the step list in the RFC)
1841          *      Note most of these are inline now. I'll inline the lot when
1842          *      I have time to test it hard and look at what gcc outputs 
1843          */
1844         
1845         if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
1846         {
1847                 bad_tcp_sequence(sk, th, skb->end_seq-th->syn, dev);
1848                 kfree_skb(skb, FREE_READ);
1849                 return 0;
1850         }
1851 
1852         if(th->rst)
1853                 return tcp_reset(sk,skb);
1854         
1855         /*
1856          *      !syn_ok is effectively the state test in RFC793.
1857          */
1858          
1859         if(th->syn && !syn_ok)
1860         {
1861                 tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
1862                 return tcp_reset(sk,skb);       
1863         }
1864 
1865         tcp_delack_estimator(sk);
1866         
1867         /*
1868          *      Process the ACK
1869          */
1870          
1871 
1872         if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
1873         {
1874                 /*
1875                  *      Our three way handshake failed.
1876                  */
1877                  
1878                 if(sk->state==TCP_SYN_RECV)
1879                 {
1880                         tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
1881                 }
1882                 kfree_skb(skb, FREE_READ);
1883                 return 0;
1884         }
1885         
1886 rfc_step6:              /* I'll clean this up later */
1887 
1888         /*
1889          *      If the accepted buffer puts us over our queue size it is
1890          *      handled further down, after tcp_data() (we must process the
1891          *      ack first to avoid deadlock cases).
1892          */
1893 
1894         /*
1895          *      Process urgent data
1896          */
1897                 
1898         tcp_urg(sk, th, len);
1899         
1900         /*
1901          *      Process the encapsulated data
1902          */
1903         
1904         if(tcp_data(skb,sk, saddr, len))
1905                 kfree_skb(skb, FREE_READ);      /* non-zero return: presumably tcp_data() did not keep the skb - verify */
1906 
1907         /*
1908          *      If our receive queue has grown past its limits,
1909          *      try to prune away duplicates etc..
1910          */
1911         if (sk->rmem_alloc > sk->rcvbuf)
1912                 prune_queue(&sk->receive_queue);
1913 
1914         /*
1915          *      And done
1916          */     
1917         
1918         return 0;
1919 
1920 no_tcp_socket:
1921         /*
1922          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
1923          */
1924         tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
1925 
1926 discard_it:
1927         /*
1928          *      Discard frame (detached from any socket first)
1929          */
1930         skb->sk = NULL;
1931         kfree_skb(skb, FREE_READ);
1932         return 0;
1933 }
/* */
root/net/ipv4/tcp_input.c

DEFINITIONS