/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	@(#)tcp_input.c	1.0.16	05/25/93
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * FIXES
 *		Pedro Roque	:	Double ACK bug
 */

#include <linux/config.h>
#include <net/tcp.h>

/*
 *	Policy code extracted so it's now separate
 */

/*
 *	Called each time to estimate the delayed ack timeout. This is
 *	how it should be done so a fast link isn't impacted by ack delay.
 */

extern __inline__ void tcp_delack_estimator(struct sock *sk)
{
	/*
	 *	Delayed ACK time estimator.
	 */

	if (sk->lrcvtime == 0)
	{
		sk->lrcvtime = jiffies;
		sk->ato = HZ/3;
	}
	else
	{
		int m;

		m = jiffies - sk->lrcvtime;

		sk->lrcvtime = jiffies;

		if (m <= 0)
			m = 1;

		if (m > (sk->rtt >> 3))
		{
			sk->ato = sk->rtt >> 3;
			/*
			 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
			 */
		}
		else
		{
			sk->ato = (sk->ato >> 1) + m;
			/*
			 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
			 */
		}
	}
}

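/*
 *	Worked example (illustrative only, assuming HZ = 100): with
 *	sk->rtt>>3 around 30 jiffies, a run of segments arriving 10 jiffies
 *	apart gives m = 10, so ato = (ato>>1) + m decays towards about 2*m.
 *	After an idle gap with m = 50 (> rtt>>3) the estimator snaps ato back
 *	to rtt>>3, so the delayed ack never waits longer than that fraction
 *	of the smoothed round trip time.
 */
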
/*
 *	Called on frames that were known _not_ to have been
 *	retransmitted [see Karn/Partridge Proceedings SIGCOMM 87].
 *	The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
 */

extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
{
	long m;
	/*
	 *	The following amusing code comes from Jacobson's
	 *	article in SIGCOMM '88.  Note that rtt and mdev
	 *	are scaled versions of rtt and mean deviation.
	 *	This is designed to be as fast as possible
	 *	m stands for "measurement".
	 */

	m = jiffies - oskb->when;	/* RTT */
	if(m<=0)
		m=1;			/* IS THIS RIGHT FOR <0 ??? */
	m -= (sk->rtt >> 3);		/* m is now error in rtt est */
	sk->rtt += m;			/* rtt = 7/8 rtt + 1/8 new */
	if (m < 0)
		m = -m;			/* m is now abs(error) */
	m -= (sk->mdev >> 2);		/* similar update on mdev */
	sk->mdev += m;			/* mdev = 3/4 mdev + 1/4 new */

	/*
	 *	Now update timeout.  Note that this removes any backoff.
	 */

	sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
	if (sk->rto > 120*HZ)
		sk->rto = 120*HZ;
	if (sk->rto < HZ/5)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
		sk->rto = HZ/5;
	sk->backoff = 0;
}

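/*
 *	Worked example (illustrative only): sk->rtt is held scaled by 8 and
 *	sk->mdev scaled by 4, so for a true smoothed RTT R and mean deviation
 *	D (both in jiffies) the timeout computed above is approximately
 *	((8R >> 2) + 4D) >> 1 = R + 2D, clamped to [HZ/5, 120*HZ]. With
 *	HZ = 100, R = 30 and D = 5 that is about 40 jiffies (0.4 seconds).
 */
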
/*
 *	Cached last hit socket
 */

static volatile unsigned long th_cache_saddr, th_cache_daddr;
static volatile unsigned short th_cache_dport, th_cache_sport;
static volatile struct sock *th_cache_sk;

void tcp_cache_zap(void)
{
	th_cache_sk=NULL;
}

/*
 *	Find the socket, using the last hit cache if applicable. The cache is not quite
 *	right...
 */

static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
{
	struct sock * sk;

	sk = (struct sock *) th_cache_sk;
	if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
	    sport != th_cache_sport || dport != th_cache_dport) {
		sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
		if (sk) {
			th_cache_saddr=saddr;
			th_cache_daddr=daddr;
			th_cache_dport=dport;
			th_cache_sport=sport;
			th_cache_sk=sk;
		}
	}
	return sk;
}

/*
 *	React to an out-of-window TCP sequence number in an incoming packet
 */

static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, u32 end_seq,
	      struct device *dev)
{
	if (th->rst)
		return;

	/*
	 *	Send a reset if we get something not ours and we are
	 *	unsynchronized. Note: We don't do anything to our end. We
	 *	are just killing the bogus remote connection then we will
	 *	connect again and it will work (with luck).
	 */

	if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
	{
		tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
		return;
	}

	/*
	 *	We got out of sequence data.
	 *	This turns out to be tricky. If the packet ends at the
	 *	edge of the window, then we MUST ack the packet,
	 *	otherwise a lost ACK packet can stall the TCP.
	 *	We deal with this case in tcp_queue().
	 *	On the other hand, if the packet is further to the
	 *	left of the window, then we are looking at a retransmitted
	 *	packet. If we ACK it we can get into a situation that
	 *	will later induce a fast retransmit of another packet.
	 *	This can end up eating up half our bandwidth.
	 */

	/* This case is NOT supposed to be able
	 * to happen. Test for it?
	 */
	if (sk->acked_seq == end_seq)
		printk("Impossible out of sequence data case.\n");
	return;
}

/*
 *	This function checks to see if the tcp header is actually acceptable.
 */

extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
{
	u32 end_window = sk->acked_seq + sk->window;
	return	/* if start is at end of window, end must be too (zero window) */
		(seq == end_window && seq == end_seq) ||
		/* if start is before end of window, check for interest */
		(before(seq, end_window) && !before(end_seq, sk->acked_seq));
}

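/*
 *	Example of the check above (illustrative only): with acked_seq = 1000
 *	and window = 500, end_window is 1500. A segment covering 1200..1300 is
 *	accepted; one ending at 900 is entirely old and is rejected; one
 *	starting at 1600 lies entirely beyond the window and is rejected.
 *	With a zero window only a segment with seq == end_seq == 1000 (a bare
 *	ACK) gets through.
 */
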
/*
 *	When we get a reset we do this. This probably is a tcp_output routine
 *	really.
 */

static int tcp_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;
	/*
	 *	We want the right error as BSD sees it (and indeed as we do).
	 */
	sk->err = ECONNRESET;
	if (sk->state == TCP_SYN_SENT)
		sk->err = ECONNREFUSED;
	if (sk->state == TCP_CLOSE_WAIT)
		sk->err = EPIPE;
#ifdef CONFIG_TCP_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 *
	 *	This is a good idea, but causes more sockets to take time to close.
	 *
	 *	Ian Heavens has since shown this is an inadequate fix for the protocol
	 *	bug in question.
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	return(0);
}

/*
 *	Look for tcp options. Parses everything but only knows about MSS.
 *	This routine is always called with the packet containing the SYN.
 *	However it may also be called with the ack to the SYN. So you
 *	can't assume this is always the SYN. It's always called after
 *	we have set up sk->mtu to our own MTU.
 *
 *	We need at minimum to add PAWS support here. Possibly large windows
 *	as Linux gets deployed on 100Mb/sec networks.
 */

static void tcp_options(struct sock *sk, struct tcphdr *th)
{
	unsigned char *ptr;
	int length=(th->doff*4)-sizeof(struct tcphdr);
	int mss_seen = 0;

	ptr = (unsigned char *)(th + 1);

	while(length>0)
	{
		int opcode=*ptr++;
		int opsize=*ptr++;
		switch(opcode)
		{
			case TCPOPT_EOL:
				return;
			case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
				length--;
				ptr--;		/* the opsize=*ptr++ above was a mistake */
				continue;

			default:
				if(opsize<=2)	/* Avoid silly options looping forever */
					return;
				switch(opcode)
				{
					case TCPOPT_MSS:
						if(opsize==4 && th->syn)
						{
							sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
							mss_seen = 1;
						}
						break;
					/* Add other options here as people feel the urge to implement stuff like large windows */
				}
				ptr+=opsize-2;
				length-=opsize;
		}
	}
	if (th->syn)
	{
		if (! mss_seen)
			sk->mtu=min(sk->mtu, 536);	/* default MSS if none sent */
	}
#ifdef CONFIG_INET_PCTCP
	sk->mss = min(sk->max_window >> 1, sk->mtu);
#else
	sk->mss = min(sk->max_window, sk->mtu);
	sk->max_unacked = 2 * sk->mss;
#endif
}

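/*
 *	For reference (illustrative only): the only option parsed above is
 *	MSS, which on the wire is kind 2, length 4, followed by a 16-bit MSS
 *	in network byte order, e.g. 02 04 05 B4 for an MSS of 1460. EOL is
 *	kind 0 and NOP is kind 1, which is why those two are handled before
 *	the length byte is trusted.
 */
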
/*
 *	This routine handles a connection request.
 *	It should make sure we haven't already responded.
 *	Because of the way BSD works, we have to send a syn/ack now.
 *	This also means it will be harder to close a socket which is
 *	listening.
 */

static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
{
	struct sock *newsk;
	struct tcphdr *th;
	struct rtable *rt;

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead)
	{
		sk->data_ready(sk,0);
	}
	else
	{
		if(sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	Make sure we can accept more.  This will prevent a
	 *	flurry of syns from eating up all our memory.
	 *
	 *	BSD does some funnies here and allows 3/2 times the
	 *	set backlog as a fudge factor. That's just too gross.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * We need to build a new sock struct.
	 * It is sort of bad to have a socket without an inode attached
	 * to it, but the wake_up's will just wake up the listening socket,
	 * and if the listening socket is destroyed before this is taken
	 * off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* just ignore the syn.  It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	memcpy(newsk, sk, sizeof(*newsk));
	newsk->opt = NULL;
	newsk->ip_route_cache = NULL;
	if (opt && opt->optlen)
	{
		newsk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
		if (!newsk->opt)
		{
			kfree_s(newsk, sizeof(struct sock));
			tcp_statistics.TcpAttemptFails++;
			kfree_skb(skb, FREE_READ);
			return;
		}
		if (ip_options_echo(newsk->opt, opt, daddr, saddr, skb))
		{
			kfree_s(newsk->opt, sizeof(struct options)+opt->optlen);
			kfree_s(newsk, sizeof(struct sock));
			tcp_statistics.TcpAttemptFails++;
			kfree_skb(skb, FREE_READ);
			return;
		}
	}
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	newsk->acked_seq = skb->seq+1;
	newsk->lastwin_seq = skb->seq+1;
	newsk->delay_acks = 1;
	newsk->copied_seq = skb->seq+1;
	newsk->fin_seq = skb->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger=0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->delack_timer);
	newsk->delack_timer.data = (unsigned long)newsk;
	newsk->delack_timer.function = tcp_delack_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function = tcp_retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/*
	 *	Swap these two, they are from our point of view.
	 */

	newsk->daddr = saddr;
	newsk->saddr = daddr;
	newsk->rcv_saddr = daddr;

	put_sock(newsk->num,newsk);
	newsk->acked_seq = skb->seq + 1;
	newsk->copied_seq = skb->seq + 1;
	newsk->socket = NULL;

	/*
	 *	Grab the ttl and tos values and use them
	 */

	newsk->ip_ttl=sk->ip_ttl;
	newsk->ip_tos=skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for
	 */

	/*
	 *	Note use of sk->user_mss, since user has no direct access to newsk
	 */

	rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
	newsk->ip_route_cache = rt;

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if (rt)
		newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	else
		newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

	/*
	 *	But not bigger than device MTU
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

#ifdef CONFIG_SKIP

	/*
	 *	SKIP devices set their MTU to 65535. This is so they can take packets
	 *	unfragmented to the security process and then fragment. They could lie to the
	 *	TCP layer about a suitable MTU, but it's easier to let skip sort it out
	 *	simply because the final package we want unfragmented is going to be
	 *
	 *	[IPHDR][IPSP][Security data][Modified TCP data][Security data]
	 */

	if(skip_pick_mtu!=NULL)		/* If SKIP is loaded.. */
		sk->mtu=skip_pick_mtu(sk->mtu,dev);
#endif
	/*
	 *	This will min with what arrived in the packet
	 */

	tcp_options(newsk,skb->h.th);

	tcp_cache_zap();
	tcp_send_synack(newsk, sk, skb);
}

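/*
 *	For orientation (summary of the surrounding code, not new behaviour):
 *	a SYN arriving on a LISTEN socket reaches this function from tcp_rcv(),
 *	the SYN/ACK goes out via tcp_send_synack() above, and the final ACK of
 *	the three way handshake is picked up by the TCP_SYN_RECV branch in
 *	tcp_ack(), which moves the new socket to TCP_ESTABLISHED.
 */
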
/*
 *	Handle a TCP window that shrunk on us. It shouldn't happen,
 *	but..
 *
 *	We may need to move packets from the send queue
 *	to the write queue, if the window has been shrunk on us.
 *	The RFC says you are not allowed to shrink your window
 *	like this, but if the other end does, you must be able
 *	to deal with it.
 */
void tcp_window_shrunk(struct sock * sk, u32 window_seq)
{
	struct sk_buff *skb;
	struct sk_buff *skb2;
	struct sk_buff *wskb = NULL;

	skb2 = sk->send_head;
	sk->send_head = NULL;
	sk->send_tail = NULL;

	/*
	 *	This is an artifact of a flawed concept. We want one
	 *	queue and a smarter send routine when we send all.
	 */
	cli();
	while (skb2 != NULL)
	{
		skb = skb2;
		skb2 = skb->link3;
		skb->link3 = NULL;
		if (after(skb->end_seq, window_seq))
		{
			if (sk->packets_out > 0)
				sk->packets_out--;
			/* We may need to remove this from the dev send list. */
			if (skb->next != NULL)
			{
				skb_unlink(skb);
			}
			/* Now add it to the write_queue. */
			if (wskb == NULL)
				skb_queue_head(&sk->write_queue,skb);
			else
				skb_append(wskb,skb);
			wskb = skb;
		}
		else
		{
			if (sk->send_head == NULL)
			{
				sk->send_head = skb;
				sk->send_tail = skb;
			}
			else
			{
				sk->send_tail->link3 = skb;
				sk->send_tail = skb;
			}
			skb->link3 = NULL;
		}
	}
	sti();
}

/*
 *	This routine deals with incoming acks, but not outgoing ones.
 *
 *	This routine is totally _WRONG_. The list structuring is wrong,
 *	the algorithm is wrong, the code is wrong.
 */

static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
{
	int flag = 0;
	u32 window_seq;

	/*
	 * 1 - there was data in packet as well as ack or new data is sent or
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 * 8 - we want to do a fast retransmit. One packet only.
	 */

	if(sk->zapped)
		return(1);	/* Dead, can't ack any more so why bother */

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */

	if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
		goto uninteresting_ack;

	/*
	 *	If there is data set flag 1
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 *	Have we discovered a larger window
	 */
	window_seq = ntohs(th->window);
	if (window_seq > sk->max_window)
	{
		sk->max_window = window_seq;
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(window_seq>>1, sk->mtu);
#else
		sk->mss = min(window_seq, sk->mtu);
#endif
	}
	window_seq += ack;

	/*
	 *	See if our window has been shrunk.
	 */
	if (after(sk->window_seq, window_seq)) {
		flag |= 4;
		tcp_window_shrunk(sk, window_seq);
	}

	/*
	 *	Pipe has emptied
	 */
	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out= 0;
	}

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{

		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a
		 * counter and increment it once every cwnd times.  It's possible
		 * that this should be done only if sk->retransmits == 0.  I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly.  In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

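	/*
	 *	Illustrative trace (not part of the original logic): with
	 *	ssthresh = 8, cong_window grows by one segment per ACK while
	 *	below ssthresh, roughly doubling every round trip; above it
	 *	cong_count has to reach cong_window before the window grows
	 *	again, i.e. about one extra segment per round trip.
	 */
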
	/*
	 *	Remember the highest ack received and update the
	 *	right hand window edge of the host.
	 *	We do a bit of work here to track number of times we've
	 *	seen this ack without a change in the right edge of the
	 *	window. This will allow us to do fast retransmits.
	 */

	if (sk->rcv_ack_seq == ack && sk->window_seq == window_seq)
	{
		/*
		 * We only want to short cut this once, many
		 * ACKs may still come, we'll do a normal transmit
		 * for these ACKs.
		 */
		if (++sk->rcv_ack_cnt == MAX_DUP_ACKS+1)
			flag |= 8;	/* flag for a fast retransmit */
	}
	else
	{
		sk->window_seq = window_seq;
		sk->rcv_ack_seq = ack;
		sk->rcv_ack_cnt = 1;
	}

	/*
	 *	We passed data and got it acked, remove any soft error
	 *	log. Something worked...
	 */

	sk->err_soft = 0;

	/*
	 *	If this ack opens up a zero window, clear backoff.  It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open ?
		 */

		if (!skb_queue_empty(&sk->write_queue) &&   /* should always be true */
		    ! before (sk->window_seq, sk->write_queue.next->end_seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt.  this eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < HZ/5)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = HZ/5;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	for (;;) {
		struct sk_buff * skb = sk->send_head;
		if (!skb)
			break;

		/* Check for a bug. */
		if (skb->link3 && after(skb->end_seq, skb->link3->end_seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived at the other end.
		 */

		if (after(skb->end_seq, ack))
			break;

		if (sk->retransmits)
		{
			/*
			 *	We were retransmitting.  don't count this in RTT est
			 */
			flag |= 2;
		}

		if ((sk->send_head = skb->link3) == NULL)
		{
			sk->send_tail = NULL;
			sk->retransmits = 0;
		}
		/*
		 * Note that we only reset backoff and rto in the
		 * rtt recomputation code.  And that doesn't happen
		 * if there were retransmissions in effect.  So the
		 * first new packet after the retransmissions is
		 * sent with the backoff still in effect.  Not until
		 * we get an ack from a non-retransmitted packet do
		 * we reset the backoff and rto.  This allows us to deal
		 * with a situation where the network delay has increased
		 * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
		 */

		/*
		 *	We have one less packet out there.
		 */

		if (sk->packets_out > 0)
			sk->packets_out --;

		if (!(flag&2)) 	/* Not retransmitting */
			tcp_rtt_estimator(sk,skb);
		flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
				   In this case as we just set it up */
		IS_SKB(skb);

		/*
		 *	We may need to remove this from the dev send list.
		 */
		cli();
		if (skb->next)
			skb_unlink(skb);
		sti();
		kfree_skb(skb, FREE_WRITE); /* write. */
		if (!sk->dead)
			sk->write_space(sk);
	}

	/*
	 * XXX someone ought to look at this too.. at the moment, if skb_peek()
	 * returns non-NULL, we completely ignore the timer stuff in the else
	 * clause.  We ought to organize the code so that else clause can
	 * (should) be executed regardless, possibly moving the PROBE timer
	 * reset over.  The skb_peek() thing should only move stuff to the
	 * write queue, NOT also manage the timer functions.
	 */

	/*
	 * Maybe we can take some stuff off of the write queue,
	 * and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (!before(sk->window_seq, sk->write_queue.next->end_seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 !after(sk->write_queue.next->end_seq, sk->rcv_ack_seq))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room.
			 */
			tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 * from TCP_CLOSE we don't do anything
		 *
		 * from anything else, if there is write data (or fin) pending,
		 * we use a TIME_WRITE timeout, else if keepalive we reset to
		 * a KEEPALIVE timeout, else we delete the timer.
		 *
		 * We do not set flag for nominal write data, otherwise we may
		 * force a state where we start to write itsy bitsy tidbits
		 * of data.
		 */

		switch(sk->state) {
		case TCP_TIME_WAIT:
			/*
			 * keep us in TIME_WAIT until we stop getting packets,
			 * reset the timeout.
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 * don't touch the timer.
			 */
			break;
		default:
			/*
			 *	Reset the xmit timer - state has changed.
			 */
			tcp_reset_xmit_timer(sk, 0, 0);
			break;
		}
	}

	/*
	 *	We have nothing queued but space to send. Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0
	    && sk->partial != NULL
	    && skb_queue_empty(&sk->write_queue)
	    && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
	 * we are now waiting for an acknowledge to our FIN.  The other end is
	 * already in TIME_WAIT.
	 *
	 * Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if(sk->debug)
			printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/ )
		{
			flag |= 1;
			sk->shutdown = SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_CLOSE);
			return 1;
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 *	Move to FIN_WAIT2 to await a FIN from the other end. Set
	 *	SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 *	Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake
	 */

	if(sk->state==TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest=th->source;
		sk->copied_seq = sk->acked_seq;
		if(!sk->dead)
			sk->state_change(sk);
		if(sk->max_window==0)
		{
			sk->max_window=32;	/* Sanity check */
			sk->mss=min(sk->max_window,sk->mtu);
		}
	}

	/*
	 * I make no guarantees about the first clause in the following
	 * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
	 * what conditions "!flag" would be true.  However I think the rest
	 * of the conditions would prevent that from causing any
	 * unnecessary retransmission.
	 *   Clearly if the first packet has expired it should be
	 * retransmitted.  The other alternative, "flag&2 && retransmits", is
	 * harder to explain:  You have to look carefully at how and when the
	 * timer is set and with what timeout.  The most recent transmission always
	 * sets the timer.  So in general if the most recent thing has timed
	 * out, everything before it has as well.  So we want to go ahead and
	 * retransmit some more.  If we didn't explicitly test for this
	 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 * would not be true.  If you look at the pattern of timing, you can
	 * show that rto is increased fast enough that the next packet would
	 * almost never be retransmitted immediately.  Then you'd end up
	 * waiting for a timeout to send each packet on the retransmission
	 * queue.  With my implementation of the Karn sampling algorithm,
	 * the timeout would double each time.  The net result is that it would
	 * take a hideous amount of time to recover from a single dropped packet.
	 * It's possible that there should also be a test for TIME_WRITE, but
	 * I think as long as "send_head != NULL" and "retransmit" is on, we've
	 * got to be in real retransmission mode.
	 *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
	 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 * As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	       (((flag&2) && sk->retransmits) ||
	       (flag&8) ||
	       (sk->send_head->when + sk->rto < jiffies)))
	{
		if(sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return 1;

uninteresting_ack:
	if(sk->debug)
		printk("Ack ignored %u %u\n",ack,sk->sent_seq);

	/*
	 *	Keepalive processing.
	 */

	if (after(ack, sk->sent_seq))
	{
		return 0;
	}

	/*
	 *	Restart the keepalive timer.
	 */

	if (sk->keepopen)
	{
		if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
			tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
	}
	return 1;
}

/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 *
 */

static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	sk->fin_seq = skb->end_seq;

	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * move to CLOSE_WAIT, tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * received a retransmission of the FIN,
			 * restart the TIME_WAIT timer.
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens, we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */

			if(sk->ip_xmit_timeout != TIME_WRITE)
				tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * received a FIN -- send ACK and enter TIME_WAIT
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * already in CLOSE
			 */
			break;
		default:
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}

/*
 *	Add a sk_buff to the TCP receive queue, calculating
 *	the ACK sequence as we go..
 */
static inline void tcp_insert_skb(struct sk_buff * skb, struct sk_buff_head * list)
{
	struct sk_buff * prev, * next;
	u32 seq;

	/*
	 * Find where the new skb goes.. (This goes backwards,
	 * on the assumption that we get the packets in order)
	 */
	seq = skb->seq;
	prev = list->prev;
	next = (struct sk_buff *) list;
	for (;;) {
		if (prev == (struct sk_buff *) list || !after(prev->seq, seq))
			break;
		next = prev;
		prev = prev->prev;
	}
	__skb_insert(skb, prev, next, list);
}

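/*
 *	Example (illustrative only): if the receive queue already holds
 *	segments starting at 100, 200 and 300 and a retransmitted segment
 *	with seq 150 arrives, the backwards scan stops at the skb with seq
 *	100 (the first one not after 150) and the new skb is linked in
 *	between 100 and 200, keeping the list ordered by sequence number.
 */
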
/*
 *	Called for each packet when we find a new ACK endpoint sequence in it
 */
static inline u32 tcp_queue_ack(struct sk_buff * skb, struct sock * sk)
{
	/*
	 *	When we ack the fin, we do the FIN
	 *	processing.
	 */
	skb->acked = 1;
	if (skb->h.th->fin)
		tcp_fin(skb,sk,skb->h.th);
	return skb->end_seq;
}

static void tcp_queue(struct sk_buff * skb, struct sock * sk, struct tcphdr *th)
{
	u32 ack_seq;

	tcp_insert_skb(skb, &sk->receive_queue);

	/*
	 * Did we get anything new to ack?
	 */
	ack_seq = sk->acked_seq;


	if (!after(skb->seq, ack_seq)) {
		if (after(skb->end_seq, ack_seq)) {
			/* the packet straddles our window end */
			struct sk_buff_head * list = &sk->receive_queue;
			struct sk_buff * next;
			ack_seq = tcp_queue_ack(skb, sk);

			/*
			 * Do we have any old packets to ack that the above
			 * made visible? (Go forward from skb)
			 */
			next = skb->next;
			while (next != (struct sk_buff *) list) {
				if (after(next->seq, ack_seq))
					break;
				if (after(next->end_seq, ack_seq))
					ack_seq = tcp_queue_ack(next, sk);
				next = next->next;
			}

			/*
			 * Ok, we found new data, update acked_seq as
			 * necessary (and possibly send the actual
			 * ACK packet).
			 */
			sk->acked_seq = ack_seq;

		} else {
			if (sk->debug)
				printk("Ack duplicate packet.\n");
			tcp_send_ack(sk);
			return;
		}


		/*
		 * Delay the ack if possible.  Send ack's to
		 * fin frames immediately as there shouldn't be
		 * anything more to come.
		 */
		if (!sk->delay_acks || th->fin) {
			tcp_send_ack(sk);
		} else {
			/*
			 * If psh is set we assume it's an
			 * interactive session that wants quick
			 * acks to avoid nagling too much.
			 */
			int delay = HZ/2;
			if (th->psh)
				delay = HZ/10;
			tcp_send_delayed_ack(sk, delay);
		}

		/*
		 *	Tell the user we have some more data.
		 */

		if (!sk->dead)
			sk->data_ready(sk,0);

	}
	else
	{
		/*
		 *	If we've missed a packet, send an ack.
		 *	Also start a timer to send another.
		 *
		 *	4.3reno machines look for these kind of acks so
		 *	they can do fast recovery. Three identical 'old'
		 *	acks lets it know that one frame has been lost
		 *	and should be resent. Because this is before the
		 *	whole window of data has timed out it can take
		 *	one lost frame per window without stalling.
		 *	[See Jacobson RFC1323, Stevens TCP/IP illus vol2]
		 *
		 *	We also should be spotting triple bad sequences.
		 *	[We now do this.]
		 *
		 */

		if (!skb->acked)
		{
			if(sk->debug)
				printk("Ack past end of seq packet.\n");
			tcp_send_ack(sk);
			sk->ack_backlog++;
			tcp_reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, HZ/2));
		}
	}
}

/*
 *	This routine handles the data.  If there is room in the buffer,
 *	it will have already been moved into it.  If there is no
 *	room, then we will just have to discard the packet.
 */

static int tcp_data(struct sk_buff *skb, struct sock *sk,
	 unsigned long saddr, unsigned short len)
{
	struct tcphdr *th;
	u32 new_seq, shut_seq;

	th = skb->h.th;
	skb_pull(skb,th->doff*4);
	skb_trim(skb,len-(th->doff*4));

	/*
	 *	The bytes in the receive read/assembly queue have increased. Needed for the
	 *	low memory discard algorithm
	 */

	sk->bytes_rcv += skb->len;

	if (skb->len == 0 && !th->fin)
	{
		/*
		 *	Don't want to keep passing ack's back and forth.
		 *	(someone sent us dataless, boring frame)
		 */
		if (!th->ack)
			tcp_send_ack(sk);
		kfree_skb(skb, FREE_READ);
		return(0);
	}

	/*
	 *	We no longer have anyone receiving data on this connection.
	 */

#ifndef TCP_DONT_RST_SHUTDOWN

	if(sk->shutdown & RCV_SHUTDOWN)
	{
		/*
		 *	FIXME: BSD has some magic to avoid sending resets to
		 *	broken 4.2 BSD keepalives. Much to my surprise a few non
		 *	BSD stacks still have broken keepalives so we want to
		 *	cope with it.
		 */

		if(skb->len)	/* We don't care if it's just an ack or
				   a keepalive/window probe */
		{
			new_seq = skb->seq + skb->len + th->syn;	/* Right edge of _data_ part of frame */

			/* Do this the way 4.4BSD treats it. Not what I'd
			   regard as the meaning of the spec but it's what BSD
			   does and clearly they know everything 8) */

			/*
			 *	This is valid because of two things
			 *
			 *	a) The way tcp_data behaves at the bottom.
			 *	b) A fin takes effect when read not when received.
			 */

			shut_seq = sk->acked_seq+1;	/* Last byte */

			if(after(new_seq,shut_seq))
			{
				if(sk->debug)
					printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
						sk, new_seq, shut_seq, sk->blog);
				if(sk->dead)
				{
					sk->acked_seq = new_seq + th->fin;
					tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
						sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
					tcp_statistics.TcpEstabResets++;
					sk->err = EPIPE;
					sk->error_report(sk);
					sk->shutdown = SHUTDOWN_MASK;
					tcp_set_state(sk,TCP_CLOSE);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
			}
		}
	}

#endif

	tcp_queue(skb, sk, th);

	return(0);
}

/*
 *	This routine is only called when we have urgent data
 *	signalled. It's the 'slow' part of tcp_urg. It could be
 *	moved inline now as tcp_urg is only called from one
 *	place. We handle URGent data wrong. We have to - as
 *	BSD still doesn't use the correction from RFC961.
 */

static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
{
	u32 ptr = ntohs(th->urg_ptr);

	if (ptr)
		ptr--;
	ptr += ntohl(th->seq);

	/* ignore urgent data that we've already seen and read */
	if (after(sk->copied_seq, ptr))
		return;

	/* do we already have a newer (or duplicate) urgent pointer? */
	if (sk->urg_data && !after(ptr, sk->urg_seq))
		return;

	/* tell the world about our new urgent pointer */
	if (sk->proc != 0) {
		if (sk->proc > 0) {
			kill_proc(sk->proc, SIGURG, 1);
		} else {
			kill_pg(-sk->proc, SIGURG, 1);
		}
	}
	sk->urg_data = URG_NOTYET;
	sk->urg_seq = ptr;
}

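/*
 *	Example of what the code above does (illustrative only): for a
 *	segment with seq 1000 and urg_ptr 4, ptr becomes 1000 + (4 - 1) =
 *	1003 and that byte is treated as the urgent byte. sk->urg_seq
 *	remembers 1003 until tcp_urg() below finds a segment covering that
 *	sequence and pulls the byte out of it.
 */
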
/*
 *	This is the 'fast' part of urgent handling.
 */

static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
{
	/*
	 *	Check if we get a new urgent pointer - normally not
	 */

	if (th->urg)
		tcp_check_urg(sk,th);

	/*
	 *	Do we wait for any urgent data? - normally not
	 */

	if (sk->urg_data == URG_NOTYET) {
		u32 ptr;

		/*
		 *	Is the urgent pointer pointing into this packet?
		 */
		ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
		if (ptr < len) {
			sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
			if (!sk->dead)
				sk->data_ready(sk,0);
		}
	}
}

/*
 * This should be a bit smarter and remove partially
 * overlapping stuff too, but this should be good
 * enough for any even remotely normal case (and the
 * worst that can happen is that we have a few
 * unnecessary packets in the receive queue).
 *
 * This function is never called with an empty list..
 */
static inline void tcp_remove_dups(struct sk_buff_head * list)
{
	struct sk_buff * next = list->next;

	for (;;) {
		struct sk_buff * skb = next;
		next = next->next;
		if (next == (struct sk_buff *) list)
			break;
		if (before(next->end_seq, skb->end_seq)) {
			__skb_unlink(next, list);
			kfree_skb(next, FREE_READ);
			next = skb;
			continue;
		}
		if (next->seq != skb->seq)
			continue;
		__skb_unlink(skb, list);
		kfree_skb(skb, FREE_READ);
	}
}

/*
 * Throw out all unnecessary packets: we've gone over the
 * receive queue limit. This shouldn't happen in a normal
 * TCP connection, but we might have gotten duplicates etc.
 */
static void prune_queue(struct sk_buff_head * list)
{
	for (;;) {
		struct sk_buff * skb = list->prev;

		/* gone through it all? */
		if (skb == (struct sk_buff *) list)
			break;
		if (!skb->acked) {
			__skb_unlink(skb, list);
			kfree_skb(skb, FREE_READ);
			continue;
		}
		tcp_remove_dups(list);
		break;
	}
}

/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	/*
	 * "redo" is 1 if we have already seen this skb but couldn't
	 * use it at that time (the socket was locked).  In that case
	 * we have already done a lot of the work (looked up the socket
	 * etc).
	 */
	th = skb->h.th;
	sk = skb->sk;
	if (!redo) {
		tcp_statistics.TcpInSegs++;
		if (skb->pkt_type!=PACKET_HOST)
			goto discard_it;

		/*
		 *	Pull up the IP header.
		 */

		skb_pull(skb, skb->h.raw-skb->data);

		/*
		 *	Try to use the device checksum if provided.
		 */
		switch (skb->ip_summed)
		{
			case CHECKSUM_NONE:
				skb->csum = csum_partial((char *)th, len, 0);
			case CHECKSUM_HW:
				if (tcp_check(th, len, saddr, daddr, skb->csum))
					goto discard_it;
			default:
				/* CHECKSUM_UNNECESSARY */
		}
		sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
		if (!sk)
			goto no_tcp_socket;
		skb->sk = sk;
		skb->seq = ntohl(th->seq);
		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
		skb->ack_seq = ntohl(th->ack_seq);

		skb->acked = 0;
		skb->used = 0;
		skb->free = 1;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/*
		 *	We may need to add it to the backlog here.
		 */
		if (sk->users)
		{
			__skb_queue_tail(&sk->back_log, skb);
			return(0);
		}
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk->zapped || sk->state==TCP_CLOSE)
		goto no_tcp_socket;

	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 */

	skb->sk=sk;
	atomic_add(skb->truesize, &sk->rmem_alloc);

	/*
	 *	We should now do header prediction.
	 */

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Suns have
			 *	this problem so I'm ignoring it
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 *
			 *	Now that TTCP is starting to be used, we ought to queue this data.
			 */

			return 0;
		}

		/*
		 *	Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
		 *	then it's a new connection
		 */

		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected. The SYN_SENT case is unusual and should
		 *	not be in line code. [AC]
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,skb->ack_seq,len))
				{
					/* Reset the ack - it's an ack from a
					   different connection  [ th->rst is checked in tcp_send_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return(0);
				}
				if(th->rst)
					return tcp_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq = skb->seq+1;
				sk->lastwin_seq = skb->seq+1;
				sk->fin_seq = skb->seq;
				tcp_send_ack(sk);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if(sk->max_window==0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

		/*
		 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
		 *	a more complex suggestion for fixing these reuse issues in RFC1644
		 *	but not yet ready for general use. Also see RFC1379.
		 *
		 *	Note the funny way we go back to the top of this function for
		 *	this case ("goto try_next_socket"). That also takes care of
		 *	checking "sk->users" for the new socket as well as doing all
		 *	the normal tests on the packet.
		 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(skb->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			atomic_sub(skb->truesize, &sk->rmem_alloc);
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			/* this is not really correct: we should check sk->users */
			if (sk && sk->state==TCP_LISTEN)
			{
				skb->sk = sk;
				atomic_add(skb->truesize, &sk->rmem_alloc);
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */

	if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
	{
		bad_tcp_sequence(sk, th, skb->end_seq-th->syn, dev);
		kfree_skb(skb, FREE_READ);
		return 0;
	}

	if(th->rst)
		return tcp_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_reset(sk,skb);
	}

	tcp_delack_estimator(sk);

	/*
	 *	Process the ACK
	 */

	if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	If the accepted buffer put us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */

	/*
	 *	Process urgent data
	 */

	tcp_urg(sk, th, len);

	/*
	 *	Process the encapsulated data
	 */

	if(tcp_data(skb,sk, saddr, len))
		kfree_skb(skb, FREE_READ);

	/*
	 *	If our receive queue has grown past its limits,
	 *	try to prune away duplicates etc..
	 */
	if (sk->rmem_alloc > sk->rcvbuf)
		prune_queue(&sk->receive_queue);

	/*
	 *	And done
	 */

	return 0;

no_tcp_socket:
	/*
	 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
	 */
	tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);

discard_it:
	/*
	 *	Discard frame
	 */
	skb->sk = NULL;
	kfree_skb(skb, FREE_READ);
	return 0;
}