root/net/ipv4/tcp_input.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. tcp_delack_estimator
  2. tcp_rtt_estimator
  3. tcp_cache_zap
  4. get_tcp_sock
  5. bad_tcp_sequence
  6. tcp_sequence
  7. tcp_reset
  8. tcp_options
  9. tcp_conn_request
  10. tcp_window_shrunk
  11. tcp_ack
  12. tcp_fin
  13. tcp_insert_skb
  14. tcp_queue_ack
  15. tcp_queue
  16. tcp_data
  17. tcp_check_urg
  18. tcp_urg
  19. tcp_remove_dups
  20. prune_queue
  21. tcp_rcv

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp_input.c 1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * FIXES
  23  *              Pedro Roque     :       Double ACK bug
  24  */
  25 
  26 #include <linux/config.h>
  27 #include <net/tcp.h>
  28 
  29 /*
  30  *      Policy code extracted so its now separate
  31  */
  32 
  33 /*
  34  *      Called each time to estimate the delayed ack timeout. This is
  35  *      how it should be done so a fast link isn't impacted by ack delay.
  36  */
  37  
  38 extern __inline__ void tcp_delack_estimator(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
  39 {
  40         /*
  41          *      Delayed ACK time estimator.
  42          */
  43         
  44         if (sk->lrcvtime == 0) 
  45         {
  46                 sk->lrcvtime = jiffies;
  47                 sk->ato = HZ/3;
  48         }
  49         else 
  50         {
  51                 int m;
  52                 
  53                 m = jiffies - sk->lrcvtime;
  54 
  55                 sk->lrcvtime = jiffies;
  56 
  57                 if (m <= 0)
  58                         m = 1;
  59 
  60                 if (m > (sk->rtt >> 3)) 
  61                 {
  62                         sk->ato = sk->rtt >> 3;
  63                         /*
  64                          * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
  65                          */
  66                 }
  67                 else 
  68                 {
  69                         sk->ato = (sk->ato >> 1) + m;
  70                         /*
  71                          * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
  72                          */
  73                 }
  74         }
  75 }
  76 
  77 /*
  78  *      Called on frames that were known _not_ to have been
  79  *      retransmitted [see Karn/Partridge Proceedings SIGCOMM 87]. 
  80  *      The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
  81  */
  82  
  83 extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
     /* [previous][next][first][last][top][bottom][index][help] */
  84 {
  85         long m;
  86         /*
  87          *      The following amusing code comes from Jacobson's
  88          *      article in SIGCOMM '88.  Note that rtt and mdev
  89          *      are scaled versions of rtt and mean deviation.
  90          *      This is designed to be as fast as possible 
  91          *      m stands for "measurement".
  92          */
  93         
  94         m = jiffies - oskb->when;  /* RTT */
  95         if(m<=0)
  96                 m=1;            /* IS THIS RIGHT FOR <0 ??? */
  97         m -= (sk->rtt >> 3);    /* m is now error in rtt est */
  98         sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
  99         if (m < 0)
 100                 m = -m;         /* m is now abs(error) */
 101         m -= (sk->mdev >> 2);   /* similar update on mdev */
 102         sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
 103 
 104         /*
 105          *      Now update timeout.  Note that this removes any backoff.
 106          */
 107                          
 108         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
 109         if (sk->rto > 120*HZ)
 110                 sk->rto = 120*HZ;
 111         if (sk->rto < HZ/5)     /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
 112                 sk->rto = HZ/5;
 113         sk->backoff = 0;
 114 }
 115 
 116 /*
 117  *      Cached last hit socket
 118  */
 119  
 120 static volatile unsigned long   th_cache_saddr, th_cache_daddr;
 121 static volatile unsigned short  th_cache_dport, th_cache_sport;
 122 static volatile struct sock *th_cache_sk;
 123 
 124 void tcp_cache_zap(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 125 {
 126         th_cache_sk=NULL;
 127 }
 128 
 129 /*
 130  *      Find the socket, using the last hit cache if applicable. The cache is not quite
 131  *      right...
 132  */
 133 
 134 static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
     /* [previous][next][first][last][top][bottom][index][help] */
 135 {
 136         struct sock * sk;
 137 
 138         sk = (struct sock *) th_cache_sk;
 139         if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
 140             sport != th_cache_sport || dport != th_cache_dport) {
 141                 sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
 142                 if (sk) {
 143                         th_cache_saddr=saddr;
 144                         th_cache_daddr=daddr;
 145                         th_cache_dport=dport;
 146                         th_cache_sport=sport;
 147                         th_cache_sk=sk;
 148                 }
 149         }
 150         return sk;
 151 }
 152 
 153 /*
 154  * React to a out-of-window TCP sequence number in an incoming packet
 155  */
 156  
 157 static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, u32 end_seq,
     /* [previous][next][first][last][top][bottom][index][help] */
 158               struct device *dev)
 159 {
 160         if (th->rst)
 161                 return;
 162 
 163         /*
 164          *      Send a reset if we get something not ours and we are
 165          *      unsynchronized. Note: We don't do anything to our end. We
 166          *      are just killing the bogus remote connection then we will
 167          *      connect again and it will work (with luck).
 168          */
 169          
 170         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
 171         {
 172                 tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
 173                 return;
 174         }
 175 
 176         /*
 177          *      4.3reno machines look for these kind of acks so they can do fast
 178          *      recovery. Three identical 'old' acks lets it know that one frame has
 179          *      been lost and should be resent. Because this is before the whole window
 180          *      of data has timed out it can take one lost frame per window without
 181          *      stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
 182          */
 183         tcp_send_ack(sk);
 184 }
 185 
 186 /*
 187  *      This functions checks to see if the tcp header is actually acceptable. 
 188  */
 189  
 190 extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
     /* [previous][next][first][last][top][bottom][index][help] */
 191 {
 192         u32 end_window = sk->acked_seq + sk->window;
 193         return  /* if start is at end of window, end must be too (zero window) */
 194                 (seq == end_window && seq == end_seq) ||
 195                 /* if start is before end of window, check for interest */
 196                 (before(seq, end_window) && !before(end_seq, sk->acked_seq));
 197 }
 198 
 199 /*
 200  *      When we get a reset we do this. This probably is a tcp_output routine
 201  *      really.
 202  */
 203 
 204 static int tcp_reset(struct sock *sk, struct sk_buff *skb)
     /* [previous][next][first][last][top][bottom][index][help] */
 205 {
 206         sk->zapped = 1;
 207         /*
 208          *      We want the right error as BSD sees it (and indeed as we do).
 209          */
 210         sk->err = ECONNRESET;
 211         if (sk->state == TCP_SYN_SENT)
 212                 sk->err = ECONNREFUSED;
 213         if (sk->state == TCP_CLOSE_WAIT)
 214                 sk->err = EPIPE;
 215 #ifdef CONFIG_TCP_RFC1337
 216         /*
 217          *      Time wait assassination protection [RFC1337]
 218          *
 219          *      This is a good idea, but causes more sockets to take time to close.
 220          *
 221          *      Ian Heavens has since shown this is an inadequate fix for the protocol
 222          *      bug in question.
 223          */
 224         if(sk->state!=TCP_TIME_WAIT)
 225         {       
 226                 tcp_set_state(sk,TCP_CLOSE);
 227                 sk->shutdown = SHUTDOWN_MASK;
 228         }
 229 #else   
 230         tcp_set_state(sk,TCP_CLOSE);
 231         sk->shutdown = SHUTDOWN_MASK;
 232 #endif  
 233         if (!sk->dead) 
 234                 sk->state_change(sk);
 235         kfree_skb(skb, FREE_READ);
 236         return(0);
 237 }
 238 
 239 
 240 /*
 241  *      Look for tcp options. Parses everything but only knows about MSS.
 242  *      This routine is always called with the packet containing the SYN.
 243  *      However it may also be called with the ack to the SYN.  So you
 244  *      can't assume this is always the SYN.  It's always called after
 245  *      we have set up sk->mtu to our own MTU.
 246  *
 247  *      We need at minimum to add PAWS support here. Possibly large windows
 248  *      as Linux gets deployed on 100Mb/sec networks.
 249  */
 250  
 251 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
 252 {
 253         unsigned char *ptr;
 254         int length=(th->doff*4)-sizeof(struct tcphdr);
 255         int mss_seen = 0;
 256     
 257         ptr = (unsigned char *)(th + 1);
 258   
 259         while(length>0)
 260         {
 261                 int opcode=*ptr++;
 262                 int opsize=*ptr++;
 263                 switch(opcode)
 264                 {
 265                         case TCPOPT_EOL:
 266                                 return;
 267                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
 268                                 length--;
 269                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
 270                                 continue;
 271                         
 272                         default:
 273                                 if(opsize<=2)   /* Avoid silly options looping forever */
 274                                         return;
 275                                 switch(opcode)
 276                                 {
 277                                         case TCPOPT_MSS:
 278                                                 if(opsize==4 && th->syn)
 279                                                 {
 280                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
 281                                                         mss_seen = 1;
 282                                                 }
 283                                                 break;
 284                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
 285                                 }
 286                                 ptr+=opsize-2;
 287                                 length-=opsize;
 288                 }
 289         }
 290         if (th->syn) 
 291         {
 292                 if (! mss_seen)
 293                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
 294         }
 295 #ifdef CONFIG_INET_PCTCP
 296         sk->mss = min(sk->max_window >> 1, sk->mtu);
 297 #else    
 298         sk->mss = min(sk->max_window, sk->mtu);
 299         sk->max_unacked = 2 * sk->mss;
 300 #endif  
 301 }
 302 
 303 
 304 /*
 305  *      This routine handles a connection request.
 306  *      It should make sure we haven't already responded.
 307  *      Because of the way BSD works, we have to send a syn/ack now.
 308  *      This also means it will be harder to close a socket which is
 309  *      listening.
 310  */
 311  
 312 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
     /* [previous][next][first][last][top][bottom][index][help] */
 313                  u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
 314 {
 315         struct sock *newsk;
 316         struct tcphdr *th;
 317         struct rtable *rt;
 318   
 319         th = skb->h.th;
 320 
 321         /* If the socket is dead, don't accept the connection. */
 322         if (!sk->dead) 
 323         {
 324                 sk->data_ready(sk,0);
 325         }
 326         else 
 327         {
 328                 if(sk->debug)
 329                         printk("Reset on %p: Connect on dead socket.\n",sk);
 330                 tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
 331                 tcp_statistics.TcpAttemptFails++;
 332                 kfree_skb(skb, FREE_READ);
 333                 return;
 334         }
 335 
 336         /*
 337          *      Make sure we can accept more.  This will prevent a
 338          *      flurry of syns from eating up all our memory.
 339          *
 340          *      BSD does some funnies here and allows 3/2 times the
 341          *      set backlog as a fudge factor. Thats just too gross.
 342          */
 343 
 344         if (sk->ack_backlog >= sk->max_ack_backlog) 
 345         {
 346                 tcp_statistics.TcpAttemptFails++;
 347                 kfree_skb(skb, FREE_READ);
 348                 return;
 349         }
 350 
 351         /*
 352          * We need to build a new sock struct.
 353          * It is sort of bad to have a socket without an inode attached
 354          * to it, but the wake_up's will just wake up the listening socket,
 355          * and if the listening socket is destroyed before this is taken
 356          * off of the queue, this will take care of it.
 357          */
 358 
 359         newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
 360         if (newsk == NULL) 
 361         {
 362                 /* just ignore the syn.  It will get retransmitted. */
 363                 tcp_statistics.TcpAttemptFails++;
 364                 kfree_skb(skb, FREE_READ);
 365                 return;
 366         }
 367 
 368         memcpy(newsk, sk, sizeof(*newsk));
 369         newsk->opt = NULL;
 370         newsk->ip_route_cache  = NULL;
 371         if (opt && opt->optlen) 
 372         {
 373                 sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
 374                 if (!sk->opt) 
 375                 {
 376                         kfree_s(newsk, sizeof(struct sock));
 377                         tcp_statistics.TcpAttemptFails++;
 378                         kfree_skb(skb, FREE_READ);
 379                         return;
 380                 }
 381                 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) 
 382                 {
 383                         kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
 384                         kfree_s(newsk, sizeof(struct sock));
 385                         tcp_statistics.TcpAttemptFails++;
 386                         kfree_skb(skb, FREE_READ);
 387                         return;
 388                 }
 389         }
 390         skb_queue_head_init(&newsk->write_queue);
 391         skb_queue_head_init(&newsk->receive_queue);
 392         newsk->send_head = NULL;
 393         newsk->send_tail = NULL;
 394         skb_queue_head_init(&newsk->back_log);
 395         newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
 396         newsk->rto = TCP_TIMEOUT_INIT;
 397         newsk->mdev = TCP_TIMEOUT_INIT<<1;
 398         newsk->max_window = 0;
 399         newsk->cong_window = 1;
 400         newsk->cong_count = 0;
 401         newsk->ssthresh = 0;
 402         newsk->backoff = 0;
 403         newsk->blog = 0;
 404         newsk->intr = 0;
 405         newsk->proc = 0;
 406         newsk->done = 0;
 407         newsk->partial = NULL;
 408         newsk->pair = NULL;
 409         newsk->wmem_alloc = 0;
 410         newsk->rmem_alloc = 0;
 411         newsk->localroute = sk->localroute;
 412 
 413         newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
 414 
 415         newsk->err = 0;
 416         newsk->shutdown = 0;
 417         newsk->ack_backlog = 0;
 418         newsk->acked_seq = skb->seq+1;
 419         newsk->lastwin_seq = skb->seq+1;
 420         newsk->delay_acks = 1;
 421         newsk->copied_seq = skb->seq+1;
 422         newsk->fin_seq = skb->seq;
 423         newsk->state = TCP_SYN_RECV;
 424         newsk->timeout = 0;
 425         newsk->ip_xmit_timeout = 0;
 426         newsk->write_seq = seq; 
 427         newsk->window_seq = newsk->write_seq;
 428         newsk->rcv_ack_seq = newsk->write_seq;
 429         newsk->urg_data = 0;
 430         newsk->retransmits = 0;
 431         newsk->linger=0;
 432         newsk->destroy = 0;
 433         init_timer(&newsk->timer);
 434         newsk->timer.data = (unsigned long)newsk;
 435         newsk->timer.function = &net_timer;
 436         init_timer(&newsk->delack_timer);
 437         newsk->delack_timer.data = (unsigned long)newsk;
 438         newsk->delack_timer.function = tcp_delack_timer;
 439         init_timer(&newsk->retransmit_timer);
 440         newsk->retransmit_timer.data = (unsigned long)newsk;
 441         newsk->retransmit_timer.function = tcp_retransmit_timer;
 442         newsk->dummy_th.source = skb->h.th->dest;
 443         newsk->dummy_th.dest = skb->h.th->source;
 444         
 445         /*
 446          *      Swap these two, they are from our point of view. 
 447          */
 448          
 449         newsk->daddr = saddr;
 450         newsk->saddr = daddr;
 451         newsk->rcv_saddr = daddr;
 452 
 453         put_sock(newsk->num,newsk);
 454         newsk->acked_seq = skb->seq + 1;
 455         newsk->copied_seq = skb->seq + 1;
 456         newsk->socket = NULL;
 457 
 458         /*
 459          *      Grab the ttl and tos values and use them 
 460          */
 461 
 462         newsk->ip_ttl=sk->ip_ttl;
 463         newsk->ip_tos=skb->ip_hdr->tos;
 464 
 465         /*
 466          *      Use 512 or whatever user asked for 
 467          */
 468 
 469         /*
 470          *      Note use of sk->user_mss, since user has no direct access to newsk 
 471          */
 472 
 473         rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
 474         newsk->ip_route_cache = rt;
 475         
 476         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
 477                 newsk->window_clamp = rt->rt_window;
 478         else
 479                 newsk->window_clamp = 0;
 480                 
 481         if (sk->user_mss)
 482                 newsk->mtu = sk->user_mss;
 483         else if (rt)
 484                 newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
 485         else 
 486                 newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
 487 
 488         /*
 489          *      But not bigger than device MTU 
 490          */
 491 
 492         newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
 493 
 494 #ifdef CONFIG_SKIP
 495         
 496         /*
 497          *      SKIP devices set their MTU to 65535. This is so they can take packets
 498          *      unfragmented to security process then fragment. They could lie to the
 499          *      TCP layer about a suitable MTU, but its easier to let skip sort it out
 500          *      simply because the final package we want unfragmented is going to be
 501          *
 502          *      [IPHDR][IPSP][Security data][Modified TCP data][Security data]
 503          */
 504          
 505         if(skip_pick_mtu!=NULL)         /* If SKIP is loaded.. */
 506                 sk->mtu=skip_pick_mtu(sk->mtu,dev);
 507 #endif
 508         /*
 509          *      This will min with what arrived in the packet 
 510          */
 511 
 512         tcp_options(newsk,skb->h.th);
 513         
 514         tcp_cache_zap();
 515         tcp_send_synack(newsk, sk, skb);
 516 }
 517 
 518 
 519 /*
 520  * Handle a TCP window that shrunk on us. It shouldn't happen,
 521  * but..
 522  *
 523  * We may need to move packets from the send queue
 524  * to the write queue, if the window has been shrunk on us.
 525  * The RFC says you are not allowed to shrink your window
 526  * like this, but if the other end does, you must be able
 527  * to deal with it.
 528  */
void tcp_window_shrunk(struct sock * sk, u32 window_seq)
{
	struct sk_buff *skb;
	struct sk_buff *skb2;	/* cursor over the old retransmit list */
	struct sk_buff *wskb = NULL;	/* last skb moved back onto write_queue */
	
	/* Detach the whole retransmit list; it is rebuilt below with only
	 * the segments that still fit inside the shrunk window. */
	skb2 = sk->send_head;
	sk->send_head = NULL;
	sk->send_tail = NULL;

	/*
	 *	This is an artifact of a flawed concept. We want one
	 *	queue and a smarter send routine when we send all.
	 */
	/* NOTE(review): send_head is sampled and cleared before interrupts
	 * are disabled below - confirm callers already run with interrupts
	 * off or that no interrupt path touches this queue concurrently. */
	cli();
	while (skb2 != NULL) 
	{
		skb = skb2;
		skb2 = skb->link3;	/* advance before we unlink skb */
		skb->link3 = NULL;
		if (after(skb->end_seq, window_seq)) 
		{
			/* Segment now ends beyond the shrunk window: it is no
			 * longer considered in flight. */
			if (sk->packets_out > 0) 
				sk->packets_out--;
			/* We may need to remove this from the dev send list. */
			if (skb->next != NULL) 
			{
				skb_unlink(skb);				
			}
			/* Now add it to the write_queue.  The first moved skb
			 * goes to the queue head; each later one is appended
			 * after the previous, preserving sequence order. */
			if (wskb == NULL)
				skb_queue_head(&sk->write_queue,skb);
			else
				skb_append(wskb,skb);
			wskb = skb;
		} 
		else 
		{
			/* Still inside the window: keep it on the rebuilt
			 * retransmit list (send_head/send_tail). */
			if (sk->send_head == NULL) 
			{
				sk->send_head = skb;
				sk->send_tail = skb;
			}
			else
			{
				sk->send_tail->link3 = skb;
				sk->send_tail = skb;
			}
			skb->link3 = NULL;
		}
	}
	sti();
}
 582 
 583 
 584 /*
 585  *      This routine deals with incoming acks, but not outgoing ones.
 586  *
 587  *      This routine is totally _WRONG_. The list structuring is wrong,
 588  *      the algorithm is wrong, the code is wrong.
 589  */
 590 
 591 static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
     /* [previous][next][first][last][top][bottom][index][help] */
 592 {
 593         int flag = 0;
 594         u32 window_seq;
 595 
 596         /* 
 597          * 1 - there was data in packet as well as ack or new data is sent or 
 598          *     in shutdown state
 599          * 2 - data from retransmit queue was acked and removed
 600          * 4 - window shrunk or data from retransmit queue was acked and removed
 601          */
 602 
 603         if(sk->zapped)
 604                 return(1);      /* Dead, cant ack any more so why bother */
 605 
 606         /*
 607          *      We have dropped back to keepalive timeouts. Thus we have
 608          *      no retransmits pending.
 609          */
 610          
 611         if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
 612                 sk->retransmits = 0;
 613 
 614         /*
 615          *      If the ack is newer than sent or older than previous acks
 616          *      then we can probably ignore it.
 617          */
 618          
 619         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
 620                 goto uninteresting_ack;
 621 
 622         /*
 623          *      If there is data set flag 1
 624          */
 625          
 626         if (len != th->doff*4) 
 627                 flag |= 1;
 628 
 629         /*
 630          *      Have we discovered a larger window
 631          */
 632         window_seq = ntohs(th->window);
 633         if (window_seq > sk->max_window) 
 634         {
 635                 sk->max_window = window_seq;
 636 #ifdef CONFIG_INET_PCTCP
 637                 /* Hack because we don't send partial packets to non SWS
 638                    handling hosts */
 639                 sk->mss = min(window_seq>>1, sk->mtu);
 640 #else
 641                 sk->mss = min(window_seq, sk->mtu);
 642 #endif  
 643         }
 644         window_seq += ack;
 645 
 646         /*
 647          *      See if our window has been shrunk. 
 648          */
 649         if (after(sk->window_seq, window_seq)) {
 650                 flag |= 4;
 651                 tcp_window_shrunk(sk, window_seq);
 652         }
 653 
 654         /*
 655          *      Pipe has emptied
 656          */      
 657         if (sk->send_tail == NULL || sk->send_head == NULL) 
 658         {
 659                 sk->send_head = NULL;
 660                 sk->send_tail = NULL;
 661                 sk->packets_out= 0;
 662         }
 663 
 664         /*
 665          *      We don't want too many packets out there. 
 666          */
 667          
 668         if (sk->ip_xmit_timeout == TIME_WRITE && 
 669                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
 670         {
 671                 
 672                 /* 
 673                  * This is Jacobson's slow start and congestion avoidance. 
 674                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
 675                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
 676                  * counter and increment it once every cwnd times.  It's possible
 677                  * that this should be done only if sk->retransmits == 0.  I'm
 678                  * interpreting "new data is acked" as including data that has
 679                  * been retransmitted but is just now being acked.
 680                  */
 681                 if (sk->cong_window < sk->ssthresh)  
 682                         /* 
 683                          *      In "safe" area, increase
 684                          */
 685                         sk->cong_window++;
 686                 else 
 687                 {
 688                         /*
 689                          *      In dangerous area, increase slowly.  In theory this is
 690                          *      sk->cong_window += 1 / sk->cong_window
 691                          */
 692                         if (sk->cong_count >= sk->cong_window) 
 693                         {
 694                                 sk->cong_window++;
 695                                 sk->cong_count = 0;
 696                         }
 697                         else 
 698                                 sk->cong_count++;
 699                 }
 700         }
 701 
 702         /*
 703          *      Remember the highest ack received and update the
 704          *      right hand window edge of the host.
 705          *      We do a bit of work here to track number of times we've
 706          *      seen this ack without a change in the right edge of the
 707          *      window and no data in the packet.
 708          *      This will allow us to do fast retransmits.
 709          */
 710 
 711         /* We are looking for duplicate ACKs here.
 712          * An ACK is a duplicate if:
 713          * (1) it has the same sequence number as the largest number we've seen,
 714          * (2) it has the same window as the last ACK,
 715          * (3) we have outstanding data that has not been ACKed
 716          * (4) The packet was not carrying any data.
 717          * I've tried to order these in occurance of most likely to fail
 718          * to least likely to fail.
 719          * [These are the rules BSD stacks use to determine if an ACK is a
 720          *  duplicate.]
 721          */
 722 
 723         if (sk->rcv_ack_seq == ack
 724                 && sk->window_seq == window_seq
 725                 && !(flag&1)
 726                 && before(ack, sk->sent_seq))
 727         {
 728                 /* See draft-stevens-tcpca-spec-01 for explanation
 729                  * of what we are doing here.
 730                  */
 731                 sk->rcv_ack_cnt++;
 732                 if (sk->rcv_ack_cnt == MAX_DUP_ACKS+1) {
 733                         sk->ssthresh = max(sk->cong_window >> 1, 2);
 734                         sk->cong_window = sk->ssthresh+MAX_DUP_ACKS+1;
 735                         tcp_do_retransmit(sk,0);
 736                         /* reduce the count. We don't want to be
 737                         * seen to be in "retransmit" mode if we
 738                         * are doing a fast retransmit.
 739                         */
 740                         sk->retransmits--;
 741                 } else if (sk->rcv_ack_cnt > MAX_DUP_ACKS+1) {
 742                         sk->cong_window++;
 743                         /*
 744                         * At this point we are suppose to transmit a NEW
 745                         * packet (not retransmit the missing packet,
 746                         * this would only get us into a retransmit war.)
 747                         * I think that having just adjusted cong_window
 748                         * we will transmit the new packet below.
 749                         */
 750                 }
 751         }
 752         else
 753         {
 754                 if (sk->rcv_ack_cnt > MAX_DUP_ACKS) {
 755                         sk->cong_window = sk->ssthresh;
 756                 }
 757                 sk->window_seq = window_seq;
 758                 sk->rcv_ack_seq = ack;
 759                 sk->rcv_ack_cnt = 1;
 760         }
 761         
 762         /*
 763          *      We passed data and got it acked, remove any soft error
 764          *      log. Something worked...
 765          */
 766          
 767         sk->err_soft = 0;
 768 
 769         /*
 770          *      If this ack opens up a zero window, clear backoff.  It was
 771          *      being used to time the probes, and is probably far higher than
 772          *      it needs to be for normal retransmission.
 773          */
 774 
 775         if (sk->ip_xmit_timeout == TIME_PROBE0) 
 776         {
 777                 sk->retransmits = 0;    /* Our probe was answered */
 778                 
 779                 /*
 780                  *      Was it a usable window open ?
 781                  */
 782                  
 783                 if (!skb_queue_empty(&sk->write_queue) &&   /* should always be true */
 784                     ! before (sk->window_seq, sk->write_queue.next->end_seq)) 
 785                 {
 786                         sk->backoff = 0;
 787                         
 788                         /*
 789                          *      Recompute rto from rtt.  this eliminates any backoff.
 790                          */
 791 
 792                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
 793                         if (sk->rto > 120*HZ)
 794                                 sk->rto = 120*HZ;
 795                         if (sk->rto < HZ/5)     /* Was 1*HZ, then 1 - turns out we must allow about
 796                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
 797                                                    .2 of a second is going to need huge windows (SIGH) */
 798                         sk->rto = HZ/5;
 799                 }
 800         }
 801 
 802         /* 
 803          *      See if we can take anything off of the retransmit queue.
 804          */
 805 
 806         for (;;) {
 807                 struct sk_buff * skb = sk->send_head;
 808                 if (!skb)
 809                         break;
 810 
 811                 /* Check for a bug. */
 812                 if (skb->link3 && after(skb->end_seq, skb->link3->end_seq)) 
 813                         printk("INET: tcp.c: *** bug send_list out of order.\n");
 814                         
 815                 /*
 816                  *      If our packet is before the ack sequence we can
 817                  *      discard it as it's confirmed to have arrived the other end.
 818                  */
 819                  
 820                 if (after(skb->end_seq, ack))
 821                         break;
 822 
 823                 if (sk->retransmits) 
 824                 {       
 825                         /*
 826                          *      We were retransmitting.  don't count this in RTT est 
 827                          */
 828                         flag |= 2;
 829                 }
 830 
 831                 if ((sk->send_head = skb->link3) == NULL)
 832                 {
 833                         sk->send_tail = NULL;
 834                         sk->retransmits = 0;
 835                 }
 836                 /*
 837                  * Note that we only reset backoff and rto in the
 838                  * rtt recomputation code.  And that doesn't happen
 839                  * if there were retransmissions in effect.  So the
 840                  * first new packet after the retransmissions is
 841                  * sent with the backoff still in effect.  Not until
 842                  * we get an ack from a non-retransmitted packet do
 843                  * we reset the backoff and rto.  This allows us to deal
 844                  * with a situation where the network delay has increased
 845                  * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
 846                  */
 847 
 848                 /*
 849                  *      We have one less packet out there. 
 850                  */
 851                          
 852                 if (sk->packets_out > 0) 
 853                         sk->packets_out --;
 854 
 855                 if (!(flag&2))  /* Not retransmitting */
 856                         tcp_rtt_estimator(sk,skb);
 857                 flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
 858                                    In this case as we just set it up */
 859                 IS_SKB(skb);
 860 
 861                 /*
 862                  *      We may need to remove this from the dev send list. 
 863                  */
 864                 cli();
 865                 if (skb->next)
 866                         skb_unlink(skb);
 867                 sti();
 868                 kfree_skb(skb, FREE_WRITE); /* write. */
 869                 if (!sk->dead)
 870                         sk->write_space(sk);
 871         }
 872 
 873         /*
 874          * XXX someone ought to look at this too.. at the moment, if skb_peek()
 875          * returns non-NULL, we complete ignore the timer stuff in the else
 876          * clause.  We ought to organize the code so that else clause can
 877          * (should) be executed regardless, possibly moving the PROBE timer
 878          * reset over.  The skb_peek() thing should only move stuff to the
 879          * write queue, NOT also manage the timer functions.
 880          */
 881 
 882         /*
 883          * Maybe we can take some stuff off of the write queue,
 884          * and put it onto the xmit queue.
 885          */
 886         if (skb_peek(&sk->write_queue) != NULL) 
 887         {
 888                 if (!before(sk->window_seq, sk->write_queue.next->end_seq) &&
 889                         (sk->retransmits == 0 || 
 890                          sk->ip_xmit_timeout != TIME_WRITE ||
 891                          !after(sk->write_queue.next->end_seq, sk->rcv_ack_seq))
 892                         && sk->packets_out < sk->cong_window) 
 893                 {
 894                         /*
 895                          *      Add more data to the send queue.
 896                          */
 897                         flag |= 1;
 898                         tcp_write_xmit(sk);
 899                 }
 900                 else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
 901                         sk->send_head == NULL &&
 902                         sk->ack_backlog == 0 &&
 903                         sk->state != TCP_TIME_WAIT) 
 904                 {
 905                         /*
 906                          *      Data to queue but no room.
 907                          */
 908                         tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
 909                 }               
 910         }
 911         else
 912         {
 913                 /*
 914                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
 915                  * from TCP_CLOSE we don't do anything
 916                  *
 917                  * from anything else, if there is write data (or fin) pending,
 918                  * we use a TIME_WRITE timeout, else if keepalive we reset to
 919                  * a KEEPALIVE timeout, else we delete the timer.
 920                  *
 921                  * We do not set flag for nominal write data, otherwise we may
 922                  * force a state where we start to write itsy bitsy tidbits
 923                  * of data.
 924                  */
 925 
 926                 switch(sk->state) {
 927                 case TCP_TIME_WAIT:
 928                         /*
 929                          * keep us in TIME_WAIT until we stop getting packets,
 930                          * reset the timeout.
 931                          */
 932                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 933                         break;
 934                 case TCP_CLOSE:
 935                         /*
 936                          * don't touch the timer.
 937                          */
 938                         break;
 939                 default:
 940                         /*
 941                          *      Must check send_head and write_queue
 942                          *      to determine which timeout to use.
 943                          */
 944                         if (sk->send_head || !skb_queue_empty(&sk->write_queue)) {
 945                                 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
 946                         } else if (sk->keepopen) {
 947                                 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
 948                         } else {
 949                                 del_timer(&sk->retransmit_timer);
 950                                 sk->ip_xmit_timeout = 0;
 951                         }
 952                         break;
 953                 }
 954         }
 955 
 956         /*
 957          *      We have nothing queued but space to send. Send any partial
 958          *      packets immediately (end of Nagle rule application).
 959          */
 960          
 961         if (sk->packets_out == 0
 962             && sk->partial != NULL
 963             && skb_queue_empty(&sk->write_queue)
 964             && sk->send_head == NULL) 
 965         {
 966                 flag |= 1;
 967                 tcp_send_partial(sk);
 968         }
 969 
 970         /*
 971          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
 972          * we are now waiting for an acknowledge to our FIN.  The other end is
 973          * already in TIME_WAIT.
 974          *
 975          * Move to TCP_CLOSE on success.
 976          */
 977 
 978         if (sk->state == TCP_LAST_ACK) 
 979         {
 980                 if (!sk->dead)
 981                         sk->state_change(sk);
 982                 if(sk->debug)
 983                         printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
 984                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
 985                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
 986                 {
 987                         flag |= 1;
 988                         sk->shutdown = SHUTDOWN_MASK;
 989                         tcp_set_state(sk,TCP_CLOSE);
 990                         return 1;
 991                 }
 992         }
 993 
 994         /*
 995          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
 996          *
 997          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
 998          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
 999          */
1000 
1001         if (sk->state == TCP_FIN_WAIT1) 
1002         {
1003 
1004                 if (!sk->dead) 
1005                         sk->state_change(sk);
1006                 if (sk->rcv_ack_seq == sk->write_seq) 
1007                 {
1008                         flag |= 1;
1009                         sk->shutdown |= SEND_SHUTDOWN;
1010                         tcp_set_state(sk, TCP_FIN_WAIT2);
1011                 }
1012         }
1013 
1014         /*
1015          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
1016          *
1017          *      Move to TIME_WAIT
1018          */
1019 
1020         if (sk->state == TCP_CLOSING) 
1021         {
1022 
1023                 if (!sk->dead) 
1024                         sk->state_change(sk);
1025                 if (sk->rcv_ack_seq == sk->write_seq) 
1026                 {
1027                         flag |= 1;
1028                         tcp_time_wait(sk);
1029                 }
1030         }
1031         
1032         /*
1033          *      Final ack of a three way shake 
1034          */
1035          
1036         if(sk->state==TCP_SYN_RECV)
1037         {
1038                 tcp_set_state(sk, TCP_ESTABLISHED);
1039                 tcp_options(sk,th);
1040                 sk->dummy_th.dest=th->source;
1041                 sk->copied_seq = sk->acked_seq;
1042                 if(!sk->dead)
1043                         sk->state_change(sk);
1044                 if(sk->max_window==0)
1045                 {
1046                         sk->max_window=32;      /* Sanity check */
1047                         sk->mss=min(sk->max_window,sk->mtu);
1048                 }
1049         }
1050         
1051         /*
1052          * I make no guarantees about the first clause in the following
1053          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
1054          * what conditions "!flag" would be true.  However I think the rest
1055          * of the conditions would prevent that from causing any
1056          * unnecessary retransmission. 
1057          *   Clearly if the first packet has expired it should be 
1058          * retransmitted.  The other alternative, "flag&2 && retransmits", is
1059          * harder to explain:  You have to look carefully at how and when the
1060          * timer is set and with what timeout.  The most recent transmission always
1061          * sets the timer.  So in general if the most recent thing has timed
1062          * out, everything before it has as well.  So we want to go ahead and
1063          * retransmit some more.  If we didn't explicitly test for this
1064          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
1065          * would not be true.  If you look at the pattern of timing, you can
1066          * show that rto is increased fast enough that the next packet would
1067          * almost never be retransmitted immediately.  Then you'd end up
1068          * waiting for a timeout to send each packet on the retransmission
1069          * queue.  With my implementation of the Karn sampling algorithm,
1070          * the timeout would double each time.  The net result is that it would
1071          * take a hideous amount of time to recover from a single dropped packet.
1072          * It's possible that there should also be a test for TIME_WRITE, but
1073          * I think as long as "send_head != NULL" and "retransmit" is on, we've
1074          * got to be in real retransmission mode.
1075          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
1076          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
1077          * As long as no further losses occur, this seems reasonable.
1078          */
1079         
1080         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
1081                (((flag&2) && sk->retransmits) ||
1082                (sk->send_head->when + sk->rto < jiffies)))
1083         {
1084                 if(sk->send_head->when + sk->rto < jiffies)
1085                         tcp_retransmit(sk,0);   
1086                 else
1087                 {
1088                         tcp_do_retransmit(sk, 1);
1089                         tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1090                 }
1091         }
1092 
1093         return 1;
1094 
1095 uninteresting_ack:
1096         if(sk->debug)
1097                 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
1098                         
1099         /*
1100          *      Keepalive processing.
1101          */
1102                  
1103         if (after(ack, sk->sent_seq)) 
1104         {
1105                 return 0;
1106         }
1107                 
1108         /*
1109          *      Restart the keepalive timer.
1110          */
1111                  
1112         if (sk->keepopen) 
1113         {
1114                 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
1115                         tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1116         }
1117         return 1;
1118 }
1119 
1120 
1121 /*
1122  *      Process the FIN bit. This now behaves as it is supposed to work
1123  *      and the FIN takes effect when it is validly part of sequence
1124  *      space. Not before when we get holes.
1125  *
1126  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1127  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
1128  *      TIME-WAIT)
1129  *
1130  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
1131  *      close and we go into CLOSING (and later onto TIME-WAIT)
1132  *
1133  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1134  *
1135  */
1136  
static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
        /*
         *      Process a FIN that is validly part of the sequence space.
         *      Records where the FIN sits and moves the socket through the
         *      appropriate close-side state transition.  Always returns 0.
         *
         *      skb - the frame carrying the FIN (end_seq covers the FIN bit)
         *      sk  - the socket this connection belongs to
         *      th  - the TCP header of the frame
         */
        sk->fin_seq = skb->end_seq;

        /*
         *      Wake anyone sleeping on socket state and post asynchronous
         *      notification (mode 1) to interested processes.
         */
        if (!sk->dead) 
        {
                sk->state_change(sk);
                sock_wake_async(sk->socket, 1);
        }

        switch(sk->state) 
        {
                case TCP_SYN_RECV:
                case TCP_SYN_SENT:
                case TCP_ESTABLISHED:
                        /*
                         * move to CLOSE_WAIT, tcp_data() already handled
                         * sending the ack.
                         */
                        tcp_set_state(sk,TCP_CLOSE_WAIT);
                        /* A FIN combined with RST shuts the socket down fully */
                        if (th->rst)
                                sk->shutdown = SHUTDOWN_MASK;
                        break;

                case TCP_CLOSE_WAIT:
                case TCP_CLOSING:
                        /*
                         * received a retransmission of the FIN, do
                         * nothing.
                         */
                        break;
                case TCP_TIME_WAIT:
                        /*
                         * received a retransmission of the FIN,
                         * restart the TIME_WAIT timer.
                         */
                        tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                        return(0);
                case TCP_FIN_WAIT1:
                        /*
                         * This case occurs when a simultaneous close
                         * happens, we must ack the received FIN and
                         * enter the CLOSING state.
                         *
                         * This causes a WRITE timeout, which will either
                         * move on to TIME_WAIT when we timeout, or resend
                         * the FIN properly (maybe we get rid of that annoying
                         * FIN lost hang). The TIME_WRITE code is already correct
                         * for handling this timeout.
                         */

                        if(sk->ip_xmit_timeout != TIME_WRITE)
                                tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
                        tcp_set_state(sk,TCP_CLOSING);
                        break;
                case TCP_FIN_WAIT2:
                        /*
                         * received a FIN -- send ACK and enter TIME_WAIT
                         */
                        tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                        sk->shutdown|=SHUTDOWN_MASK;
                        tcp_set_state(sk,TCP_TIME_WAIT);
                        break;
                case TCP_CLOSE:
                        /*
                         * already in CLOSE
                         */
                        break;
                default:
                        /*
                         *      Any other state (e.g. we had already sent our
                         *      own FIN): acknowledge theirs and wait for the
                         *      final ACK in LAST_ACK, bounded by the MSL timer.
                         */
                        tcp_set_state(sk,TCP_LAST_ACK);
        
                        /* Start the timers. */
                        tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                        return(0);
        }

        return(0);
}
1215 
1216 /*
1217  * Add a sk_buff to the TCP receive queue, calculating
1218  * the ACK sequence as we go..
1219  */
1220 static inline void tcp_insert_skb(struct sk_buff * skb, struct sk_buff_head * list)
     /* [previous][next][first][last][top][bottom][index][help] */
1221 {
1222         struct sk_buff * prev, * next;
1223         u32 seq;
1224 
1225         /*
1226          * Find where the new skb goes.. (This goes backwards,
1227          * on the assumption that we get the packets in order)
1228          */
1229         seq = skb->seq;
1230         prev = list->prev;
1231         next = (struct sk_buff *) list;
1232         for (;;) {
1233                 if (prev == (struct sk_buff *) list || !after(prev->seq, seq))
1234                         break;
1235                 next = prev;
1236                 prev = prev->prev;
1237         }
1238         __skb_insert(skb, prev, next, list);
1239 }
1240 
1241 /*
1242  * Called for each packet when we find a new ACK endpoint sequence in it
1243  */
1244 static inline u32 tcp_queue_ack(struct sk_buff * skb, struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1245 {
1246         /*
1247          *      When we ack the fin, we do the FIN 
1248          *      processing.
1249          */
1250         skb->acked = 1;
1251         if (skb->h.th->fin)
1252                 tcp_fin(skb,sk,skb->h.th);
1253         return skb->end_seq;
1254 }       
1255 
static void tcp_queue(struct sk_buff * skb, struct sock * sk, struct tcphdr *th)
{
        /*
         *      Queue an in-window data segment on the receive queue and
         *      advance acked_seq over any contiguous data this makes
         *      visible.  Decides between an immediate and a delayed ACK.
         */
        u32 ack_seq;

        tcp_insert_skb(skb, &sk->receive_queue);

        /*
         * Did we get anything new to ack?
         */
        ack_seq = sk->acked_seq;


        if (!after(skb->seq, ack_seq)) {
                if (after(skb->end_seq, ack_seq)) {
                        /* the packet straddles our window end */
                        struct sk_buff_head * list = &sk->receive_queue;
                        struct sk_buff * next;
                        ack_seq = tcp_queue_ack(skb, sk);

                        /*
                         * Do we have any old packets to ack that the above
                         * made visible? (Go forward from skb)
                         */
                        next = skb->next;
                        while (next != (struct sk_buff *) list) {
                                if (after(next->seq, ack_seq))
                                        break;
                                if (after(next->end_seq, ack_seq))
                                        ack_seq = tcp_queue_ack(next, sk);
                                next = next->next;
                        }

                        /*
                         * Ok, we found new data, update acked_seq as
                         * necessary (and possibly send the actual
                         * ACK packet).
                         */
                        sk->acked_seq = ack_seq;

                } else {
                        /* entirely old data: re-ack it and we are done */
                        if (sk->debug)
                                printk("Ack duplicate packet.\n");
                        tcp_send_ack(sk);
                        return;
                }


                /*
                 * Delay the ack if possible.  Send ack's to
                 * fin frames immediately as there shouldn't be
                 * anything more to come.
                 */
                if (!sk->delay_acks || th->fin) {
                        tcp_send_ack(sk);
                } else {
                        /*
                         * If psh is set we assume it's an
                         * interactive session that wants quick
                         * acks to avoid nagling too much. 
                         */
                        int delay = HZ/2;
                        if (th->psh)
                                delay = HZ/50;
                        tcp_send_delayed_ack(sk, delay);
                }

                /*
                 *      Tell the user we have some more data.
                 */

                if (!sk->dead)
                        sk->data_ready(sk,0);

        }
        else
        {
            /*
             *  If we've missed a packet, send an ack.
             *  Also start a timer to send another.
             *
             *  4.3reno machines look for these kind of acks so
             *  they can do fast recovery. Three identical 'old'
             *  acks lets it know that one frame has been lost
             *      and should be resent. Because this is before the
             *  whole window of data has timed out it can take
             *  one lost frame per window without stalling.
             *  [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
             *
             *  We also should be spotting triple bad sequences.
             *  [We now do this.]
             *
             */
             
            if (!skb->acked) 
            {
                    if(sk->debug)
                            printk("Ack past end of seq packet.\n");
                    tcp_send_ack(sk);
                    tcp_send_delayed_ack(sk,HZ/2);
            }
        }
}
1358 
1359 
1360 /*
1361  *      This routine handles the data.  If there is room in the buffer,
1362  *      it will be have already been moved into it.  If there is no
1363  *      room, then we will just have to discard the packet.
1364  */
1365 
static int tcp_data(struct sk_buff *skb, struct sock *sk, 
         unsigned long saddr, unsigned int len)
{
        /*
         *      Handle the data portion of an incoming segment: strip the
         *      TCP header, discard boring dataless frames, reset dead
         *      connections that still receive data, and otherwise hand the
         *      frame to tcp_queue().  Always returns 0.
         *
         *      NOTE(review): saddr is not referenced in this function body.
         */
        struct tcphdr *th;
        u32 new_seq, shut_seq;

        th = skb->h.th;
        /* Strip the TCP header so skb->len covers only the data bytes */
        skb_pull(skb,th->doff*4);
        skb_trim(skb,len-(th->doff*4));

        /*
         *      The bytes in the receive read/assembly queue has increased. Needed for the
         *      low memory discard algorithm 
         */
           
        sk->bytes_rcv += skb->len;
        
        if (skb->len == 0 && !th->fin) 
        {
                /* 
                 *      Don't want to keep passing ack's back and forth. 
                 *      (someone sent us dataless, boring frame)
                 */
                if (!th->ack)
                        tcp_send_ack(sk);
                kfree_skb(skb, FREE_READ);
                return(0);
        }
        
        /*
         *      We no longer have anyone receiving data on this connection.
         */

#ifndef TCP_DONT_RST_SHUTDOWN            

        if(sk->shutdown & RCV_SHUTDOWN)
        {
                /*
                 *      FIXME: BSD has some magic to avoid sending resets to
                 *      broken 4.2 BSD keepalives. Much to my surprise a few non
                 *      BSD stacks still have broken keepalives so we want to
                 *      cope with it.
                 */

                if(skb->len)    /* We don't care if it's just an ack or
                                   a keepalive/window probe */
                {
                        new_seq = skb->seq + skb->len + th->syn;        /* Right edge of _data_ part of frame */
                        
                        /* Do this the way 4.4BSD treats it. Not what I'd
                           regard as the meaning of the spec but it's what BSD
                           does and clearly they know everything 8) */

                        /*
                         *      This is valid because of two things
                         *
                         *      a) The way tcp_data behaves at the bottom.
                         *      b) A fin takes effect when read not when received.
                         */
                         
                        shut_seq = sk->acked_seq+1;     /* Last byte */
                        
                        if(after(new_seq,shut_seq))
                        {
                                if(sk->debug)
                                        printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
                                                sk, new_seq, shut_seq, sk->blog);
                                /* Only reset if nobody can still read: the socket is dead */
                                if(sk->dead)
                                {
                                        sk->acked_seq = new_seq + th->fin;
                                        tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
                                                sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
                                        tcp_statistics.TcpEstabResets++;
                                        sk->err = EPIPE;
                                        sk->error_report(sk);
                                        sk->shutdown = SHUTDOWN_MASK;
                                        tcp_set_state(sk,TCP_CLOSE);
                                        kfree_skb(skb, FREE_READ);
                                        return 0;
                                }
                        }
                }
        }

#endif

        tcp_queue(skb, sk, th);

        return(0);
}
1456 
1457 
1458 /*
1459  *      This routine is only called when we have urgent data
1460  *      signalled. Its the 'slow' part of tcp_urg. It could be
1461  *      moved inline now as tcp_urg is only called from one
1462  *      place. We handle URGent data wrong. We have to - as
1463  *      BSD still doesn't use the correction from RFC961.
1464  */
1465  
1466 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /* [previous][next][first][last][top][bottom][index][help] */
1467 {
1468         u32 ptr = ntohs(th->urg_ptr);
1469 
1470         if (ptr)
1471                 ptr--;
1472         ptr += ntohl(th->seq);
1473 
1474         /* ignore urgent data that we've already seen and read */
1475         if (after(sk->copied_seq, ptr))
1476                 return;
1477 
1478         /* do we already have a newer (or duplicate) urgent pointer? */
1479         if (sk->urg_data && !after(ptr, sk->urg_seq))
1480                 return;
1481 
1482         /* tell the world about our new urgent pointer */
1483         if (sk->proc != 0) {
1484                 if (sk->proc > 0) {
1485                         kill_proc(sk->proc, SIGURG, 1);
1486                 } else {
1487                         kill_pg(-sk->proc, SIGURG, 1);
1488                 }
1489         }
1490         sk->urg_data = URG_NOTYET;
1491         sk->urg_seq = ptr;
1492 }
1493 
1494 /*
1495  *      This is the 'fast' part of urgent handling.
1496  */
1497  
1498 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
     /* [previous][next][first][last][top][bottom][index][help] */
1499 {
1500         /*
1501          *      Check if we get a new urgent pointer - normally not 
1502          */
1503          
1504         if (th->urg)
1505                 tcp_check_urg(sk,th);
1506 
1507         /*
1508          *      Do we wait for any urgent data? - normally not
1509          */
1510          
1511         if (sk->urg_data == URG_NOTYET) {
1512                 u32 ptr;
1513 
1514                 /*
1515                  *      Is the urgent pointer pointing into this packet? 
1516                  */      
1517                 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
1518                 if (ptr < len) {
1519                         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
1520                         if (!sk->dead)
1521                                 sk->data_ready(sk,0);
1522                 }
1523         }
1524 }
1525 
/*
 * This should be a bit smarter and remove partially
 * overlapping stuff too, but this should be good
 * enough for any even remotely normal case (and the
 * worst that can happen is that we have a few
 * unnecessary packets in the receive queue).
 *
 * This function is never called with an empty list..
 */
static inline void tcp_remove_dups(struct sk_buff_head * list)
{
	struct sk_buff * next = list->next;

	/*
	 * Walk adjacent pairs (skb, next) of the receive queue.  The list
	 * is circular, so hitting the head sentinel terminates the scan.
	 */
	for (;;) {
		struct sk_buff * skb = next;
		next = next->next;
		if (next == (struct sk_buff *) list)
			break;
		/*
		 * 'next' ends before 'skb' does, i.e. it is fully covered
		 * by 'skb': drop 'next' and re-pair 'skb' with the buffer
		 * after it on the following iteration.
		 */
		if (before(next->end_seq, skb->end_seq)) {
			__skb_unlink(next, list);
			kfree_skb(next, FREE_READ);
			next = skb;
			continue;
		}
		if (next->seq != skb->seq)
			continue;
		/*
		 * Same starting sequence and 'next' reaches at least as far
		 * (the before() test above failed), so 'skb' is redundant.
		 */
		__skb_unlink(skb, list);
		kfree_skb(skb, FREE_READ);
	}
}
1556 
1557 /*
1558  * Throw out all unnecessary packets: we've gone over the
1559  * receive queue limit. This shouldn't happen in a normal
1560  * TCP connection, but we might have gotten duplicates etc.
1561  */
1562 static void prune_queue(struct sk_buff_head * list)
     /* [previous][next][first][last][top][bottom][index][help] */
1563 {
1564         for (;;) {
1565                 struct sk_buff * skb = list->prev;
1566 
1567                 /* gone through it all? */
1568                 if (skb == (struct sk_buff *) list)
1569                         break;
1570                 if (!skb->acked) {
1571                         __skb_unlink(skb, list);
1572                         kfree_skb(skb, FREE_READ);
1573                         continue;
1574                 }
1575                 tcp_remove_dups(list);
1576                 break;
1577         }
1578 }
1579 
/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 *
 *	Returns 0 in all cases; the skb is either queued to a socket,
 *	queued to the socket backlog, or freed here.
 */
 
int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	/*
	 * "redo" is 1 if we have already seen this skb but couldn't
	 * use it at that time (the socket was locked).  In that case
	 * we have already done a lot of the work (looked up the socket
	 * etc).
	 */
	th = skb->h.th;
	sk = skb->sk;
	if (!redo) {
		tcp_statistics.TcpInSegs++;
		/* Not addressed to this host (e.g. promiscuous capture) - drop. */
		if (skb->pkt_type!=PACKET_HOST)
			goto discard_it;

		/*
		 *	Pull up the IP header.
		 */
	
		skb_pull(skb, skb->h.raw-skb->data);

		/*
		 *	Try to use the device checksum if provided.
		 *	Deliberate fall-through: CHECKSUM_NONE computes the
		 *	sum in software, then verifies it exactly as the
		 *	CHECKSUM_HW case does.
		 */
		switch (skb->ip_summed) 
		{
			case CHECKSUM_NONE:
				skb->csum = csum_partial((char *)th, len, 0);
			case CHECKSUM_HW:
				if (tcp_check(th, len, saddr, daddr, skb->csum))
					goto discard_it;
			default:
				/* CHECKSUM_UNNECESSARY */
		}
		sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
		if (!sk)
			goto no_tcp_socket;
		skb->sk = sk;
		/* Cache host-order sequence numbers on the skb; end_seq counts
		   SYN and FIN as one sequence unit each, per RFC 793. */
		skb->seq = ntohl(th->seq);
		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
		skb->ack_seq = ntohl(th->ack_seq);

		skb->acked = 0;
		skb->used = 0;
		skb->free = 1;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/*
		 * We may need to add it to the backlog here. 
		 */
		if (sk->users) 
		{
			__skb_queue_tail(&sk->back_log, skb);
			return(0);
		}
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes 
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk->zapped || sk->state==TCP_CLOSE)
		goto no_tcp_socket;

	if (!sk->prot) 
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket. 
	 */
	 
	skb->sk=sk;
	atomic_add(skb->truesize, &sk->rmem_alloc);
	
	/*
	 *	We should now do header prediction.
	 */
	 
	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{
	
		/*
		 *	Now deal with unusual cases.
		 */
	 
		if(sk->state==TCP_LISTEN)
		{
			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it 
			 */
			   
			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				return 0;
			}
		
			/*	
			 *	Guess we need to make a new socket up 
			 */
		
			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
		
			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 *
			 *	Now that T/TCP is starting to be used we ought to queue this data.
			 */
			 
			return 0;
		}
	
		/* 
		 *	Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
		 *	then its a new connection
		 */
		 
		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			return 0;
		}
		
		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected. The SYN_SENT case is unusual and should
		 *	not be in line code. [AC]
		 */
	   
		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,skb->ack_seq,len))
				{
					/* Reset the ack - its an ack from a 
					   different connection  [ th->rst is checked in tcp_send_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return(0);
				}
				if(th->rst)
					return tcp_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq = skb->seq+1;
				sk->lastwin_seq = skb->seq+1;
				sk->fin_seq = skb->seq;
				tcp_send_ack(sk);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				/* No window option seen: fall back to conservative defaults */
				if(sk->max_window==0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);
					
					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}		
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
	 *	a more complex suggestion for fixing these reuse issues in RFC1644
	 *	but not yet ready for general use. Also see RFC1379.
	 *
	 *	Note the funny way we go back to the top of this function for
	 *	this case ("goto try_next_socket").  That also takes care of
	 *	checking "sk->users" for the new socket as well as doing all
	 *	the normal tests on the packet.
	 */
	
#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		/* A fresh SYN beyond the old data on a dead TIME_WAIT socket:
		   kill the old socket and hand the SYN to a matching listener. */
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
			after(skb->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;	   
			atomic_sub(skb->truesize, &sk->rmem_alloc);
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			/* this is not really correct: we should check sk->users */
			if (sk && sk->state==TCP_LISTEN)
			{
				skb->sk = sk;
				atomic_add(skb->truesize, &sk->rmem_alloc);
				/* New ISN well clear of the old connection's sequence space */
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif	
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs 
	 */
	
	if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
	{
		bad_tcp_sequence(sk, th, skb->end_seq-th->syn, dev);
		kfree_skb(skb, FREE_READ);
		return 0;
	}

	if(th->rst)
		return tcp_reset(sk,skb);
	
	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */
	 
	if(th->syn && !syn_ok)
	{
		tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_reset(sk,skb);	
	}

	tcp_delack_estimator(sk);
	
	/*
	 *	Process the ACK
	 */
	 

	if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
	{
		/*
		 *	Our three way handshake failed.
		 */
		 
		if(sk->state==TCP_SYN_RECV)
		{
			tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		return 0;
	}
	
rfc_step6:		/* I'll clean this up later */

	/*
	 *	If the accepted buffer put us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */

	/*
	 *	Process urgent data
	 */
		
	tcp_urg(sk, th, len);
	
	/*
	 *	Process the encapsulated data
	 */
	
	if(tcp_data(skb,sk, saddr, len))
		kfree_skb(skb, FREE_READ);

	/*
	 *	If our receive queue has grown past its limits,
	 *	try to prune away duplicates etc..
	 */
	if (sk->rmem_alloc > sk->rcvbuf)
		prune_queue(&sk->receive_queue);

	/*
	 *	And done
	 */	
	
	return 0;

no_tcp_socket:
	/*
	 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
	 */
	tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);

discard_it:
	/*
	 *	Discard frame
	 */
	skb->sk = NULL;
	kfree_skb(skb, FREE_READ);
	return 0;
}

/* [previous][next][first][last][top][bottom][index][help] */