root/net/ipv4/tcp_input.c


DEFINITIONS

This source file includes the following definitions:
  1. tcp_delack_estimator
  2. tcp_rtt_estimator
  3. tcp_cache_zap
  4. get_tcp_sock
  5. bad_tcp_sequence
  6. tcp_sequence
  7. tcp_reset
  8. tcp_options
  9. tcp_conn_request
  10. tcp_window_shrunk
  11. tcp_ack
  12. tcp_fin
  13. tcp_insert_skb
  14. tcp_queue_ack
  15. tcp_queue
  16. tcp_data
  17. tcp_check_urg
  18. tcp_urg
  19. tcp_remove_dups
  20. prune_queue
  21. tcp_rcv

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp_input.c 1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * FIXES
  23  *              Pedro Roque     :       Double ACK bug
  24  */
  25 
  26 #include <linux/config.h>
  27 #include <net/tcp.h>
  28 
  29 #include <linux/interrupt.h>
  30 
  31 /*
   32  *      Policy code extracted so it's now separate
  33  */
  34 
  35 /*
  36  *      Called each time to estimate the delayed ack timeout. This is
   37  *      how it should be done so a fast link isn't impacted by ack delay.
  38  */
  39  
  40 extern __inline__ void tcp_delack_estimator(struct sock *sk)
  41 {
  42         /*
  43          *      Delayed ACK time estimator.
  44          */
  45         
  46         if (sk->lrcvtime == 0) 
  47         {
  48                 sk->lrcvtime = jiffies;
  49                 sk->ato = HZ/3;
  50         }
  51         else 
  52         {
  53                 int m;
  54                 
  55                 m = jiffies - sk->lrcvtime;
  56 
  57                 sk->lrcvtime = jiffies;
  58 
  59                 if (m <= 0)
  60                         m = 1;
  61 
  62                 if (m > (sk->rtt >> 3)) 
  63                 {
  64                         sk->ato = sk->rtt >> 3;
  65                         /*
  66                          * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
  67                          */
  68                 }
  69                 else 
  70                 {
  71                         sk->ato = (sk->ato >> 1) + m;
  72                         /*
  73                          * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
  74                          */
  75                 }
  76         }
  77 }
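
      /*
       *	Editor's sketch (not part of the original file): the core of
       *	the estimator above, extracted for illustration with
       *	hypothetical names. "ato" tracks the inter-arrival gap "m"
       *	with an EWMA, clamped to rtt/8 so a fast link is never held
       *	up waiting to ack.
       */
      #if 0
      static unsigned long demo_ato(unsigned long ato, unsigned long rtt,
                                    unsigned long m)
      {
              if (m == 0)
                      m = 1;                  /* same-jiffy arrival counts as 1 */
              if (m > (rtt >> 3))
                      return rtt >> 3;        /* gap exceeds rtt/8: clamp */
              return (ato >> 1) + m;          /* decay old estimate towards m */
      }
      #endif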
  78 
  79 /*
  80  *      Called on frames that were known _not_ to have been
  81  *      retransmitted [see Karn/Partridge Proceedings SIGCOMM 87]. 
  82  *      The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
  83  */
  84  
  85 extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
  86 {
  87         long m;
  88         /*
  89          *      The following amusing code comes from Jacobson's
  90          *      article in SIGCOMM '88.  Note that rtt and mdev
  91          *      are scaled versions of rtt and mean deviation.
  92          *      This is designed to be as fast as possible 
  93          *      m stands for "measurement".
  94          */
  95         
  96         m = jiffies - oskb->when;  /* RTT */
  97         if(m<=0)
  98                 m=1;            /* IS THIS RIGHT FOR <0 ??? */
  99         m -= (sk->rtt >> 3);    /* m is now error in rtt est */
 100         sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
 101         if (m < 0)
 102                 m = -m;         /* m is now abs(error) */
 103         m -= (sk->mdev >> 2);   /* similar update on mdev */
 104         sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
 105 
 106         /*
 107          *      Now update timeout.  Note that this removes any backoff.
 108          */
 109                          
 110         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
 111         if (sk->rto > 120*HZ)
 112                 sk->rto = 120*HZ;
  113         if (sk->rto < HZ/5)     /* Was 1*HZ - keep .2 as minimum because of the BSD delayed acks */
 114                 sk->rto = HZ/5;
 115         sk->backoff = 0;
 116 }
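
      /*
       *	Editor's sketch (not part of the original file): the same
       *	fixed-point update in isolation, with hypothetical names.
       *	rtt holds srtt scaled by 8 and mdev the mean deviation scaled
       *	by 4, so the final shift yields roughly srtt + 2*mdev.
       */
      #if 0
      static void demo_rtt_update(long *rtt, long *mdev, long *rto, long m)
      {
              if (m <= 0)
                      m = 1;
              m -= (*rtt >> 3);                       /* error against srtt */
              *rtt += m;                              /* srtt = 7/8 srtt + 1/8 sample */
              if (m < 0)
                      m = -m;
              m -= (*mdev >> 2);
              *mdev += m;                             /* mdev = 3/4 mdev + 1/4 |error| */
              *rto = ((*rtt >> 2) + *mdev) >> 1;      /* == srtt + 2*mdev, unscaled */
      }
      #endif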
 117 
 118 /*
 119  *      Cached last hit socket
 120  */
 121  
 122 static volatile unsigned long   th_cache_saddr, th_cache_daddr;
 123 static volatile unsigned short  th_cache_dport, th_cache_sport;
 124 static volatile struct sock *th_cache_sk;
 125 
 126 void tcp_cache_zap(void)
 127 {
 128         th_cache_sk=NULL;
 129 }
 130 
 131 /*
 132  *      Find the socket, using the last hit cache if applicable. The cache is not quite
 133  *      right...
 134  */
 135 
 136 static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
 137 {
 138         struct sock * sk;
 139 
 140         sk = (struct sock *) th_cache_sk;
 141         if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
 142             sport != th_cache_sport || dport != th_cache_dport) {
 143                 sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
 144                 if (sk) {
 145                         th_cache_saddr=saddr;
 146                         th_cache_daddr=daddr;
 147                         th_cache_dport=dport;
 148                         th_cache_sport=sport;
 149                         th_cache_sk=sk;
 150                 }
 151         }
 152         return sk;
 153 }
 154 
 155 /*
  156  * React to an out-of-window TCP sequence number in an incoming packet
 157  */
 158  
 159 static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
 160              struct options *opt, unsigned long saddr, struct device *dev)
 161 {
 162         if (th->rst)
 163                 return;
 164 
 165         /*
 166          *      Send a reset if we get something not ours and we are
 167          *      unsynchronized. Note: We don't do anything to our end. We
  168          *      are just killing the bogus remote connection; then we will
 169          *      connect again and it will work (with luck).
 170          */
 171          
 172         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
 173         {
 174                 tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
 175                 return;
 176         }
 177         
 178         /*
  179          *      4.3reno machines look for these kinds of acks so they can do fast
  180          *      recovery. Three identical 'old' acks let it know that one frame has
 181          *      been lost and should be resent. Because this is before the whole window
 182          *      of data has timed out it can take one lost frame per window without
 183          *      stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
 184          *
 185          *      We also should be spotting triple bad sequences.
 186          */
 187         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
 188         return;
 189 }
 190 
 191 /*
  192  *      This function checks to see if the tcp header is actually acceptable. 
 193  */
 194  
 195 extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
 196 {
 197         u32 end_window = sk->acked_seq + sk->window;
 198         return  /* if start is at end of window, end must be too (zero window) */
 199                 (seq == end_window && seq == end_seq) ||
 200                 /* if start is before end of window, check for interest */
 201                 (before(seq, end_window) && !before(end_seq, sk->acked_seq));
 202 }
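
      /*
       *	Editor's note: before()/after() compare sequence numbers
       *	modulo 2^32, so the test above stays correct across
       *	wraparound. A minimal sketch of the comparison (hypothetical
       *	name):
       *
       *		#define demo_before(a, b)  ((int)((a) - (b)) < 0)
       *
       *	e.g. demo_before(0xfffffff0, 0x10) is true: 0x10 lies just
       *	after the wrap.
       */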
 203 
 204 /*
 205  *      When we get a reset we do this. This probably is a tcp_output routine
 206  *      really.
 207  */
 208 
 209 static int tcp_reset(struct sock *sk, struct sk_buff *skb)
 210 {
 211         sk->zapped = 1;
 212         /*
 213          *      We want the right error as BSD sees it (and indeed as we do).
 214          */
 215         sk->err = ECONNRESET;
 216         if (sk->state == TCP_SYN_SENT)
 217                 sk->err = ECONNREFUSED;
 218         if (sk->state == TCP_CLOSE_WAIT)
 219                 sk->err = EPIPE;
 220 #ifdef CONFIG_TCP_RFC1337
 221         /*
 222          *      Time wait assassination protection [RFC1337]
 223          *
 224          *      This is a good idea, but causes more sockets to take time to close.
 225          *
 226          *      Ian Heavens has since shown this is an inadequate fix for the protocol
 227          *      bug in question.
 228          */
 229         if(sk->state!=TCP_TIME_WAIT)
 230         {       
 231                 tcp_set_state(sk,TCP_CLOSE);
 232                 sk->shutdown = SHUTDOWN_MASK;
 233         }
 234 #else   
 235         tcp_set_state(sk,TCP_CLOSE);
 236         sk->shutdown = SHUTDOWN_MASK;
 237 #endif  
 238         if (!sk->dead) 
 239                 sk->state_change(sk);
 240         kfree_skb(skb, FREE_READ);
 241         return(0);
 242 }
 243 
 244 
 245 /*
 246  *      Look for tcp options. Parses everything but only knows about MSS.
  247  *      This routine is normally called with the packet containing the SYN.
  248  *      However it may also be called with the ack to the SYN.  So you
  249  *      can't assume this is always the SYN.  It's always called after
 250  *      we have set up sk->mtu to our own MTU.
 251  *
 252  *      We need at minimum to add PAWS support here. Possibly large windows
 253  *      as Linux gets deployed on 100Mb/sec networks.
 254  */
 255  
 256 static void tcp_options(struct sock *sk, struct tcphdr *th)
 257 {
 258         unsigned char *ptr;
 259         int length=(th->doff*4)-sizeof(struct tcphdr);
 260         int mss_seen = 0;
 261     
 262         ptr = (unsigned char *)(th + 1);
 263   
 264         while(length>0)
 265         {
 266                 int opcode=*ptr++;
 267                 int opsize=*ptr++;
 268                 switch(opcode)
 269                 {
 270                         case TCPOPT_EOL:
 271                                 return;
 272                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
 273                                 length--;
 274                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
 275                                 continue;
 276                         
 277                         default:
 278                                 if(opsize<=2)   /* Avoid silly options looping forever */
 279                                         return;
 280                                 switch(opcode)
 281                                 {
 282                                         case TCPOPT_MSS:
 283                                                 if(opsize==4 && th->syn)
 284                                                 {
 285                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
 286                                                         mss_seen = 1;
 287                                                 }
 288                                                 break;
 289                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
 290                                 }
 291                                 ptr+=opsize-2;
 292                                 length-=opsize;
 293                 }
 294         }
 295         if (th->syn) 
 296         {
 297                 if (! mss_seen)
 298                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
 299         }
 300 #ifdef CONFIG_INET_PCTCP
 301         sk->mss = min(sk->max_window >> 1, sk->mtu);
 302 #else    
 303         sk->mss = min(sk->max_window, sk->mtu);
 304         sk->max_unacked = 2 * sk->mss;
 305 #endif  
 306 }
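
      /*
       *	Editor's reference (not part of the original file): the only
       *	option the parser above acts on is MSS, which is four bytes
       *	on the wire -- kind, length, and a 16-bit value in network
       *	byte order:
       */
      #if 0
      static const unsigned char demo_mss_option[4] = {
              TCPOPT_MSS, 4,          /* kind = 2, length = 4 */
              0x05, 0xb4              /* htons(1460) */
      };
      #endif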
 307 
 308 
 309 /*
 310  *      This routine handles a connection request.
 311  *      It should make sure we haven't already responded.
 312  *      Because of the way BSD works, we have to send a syn/ack now.
 313  *      This also means it will be harder to close a socket which is
 314  *      listening.
 315  */
 316  
 317 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
 318                  u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
 319 {
 320         struct sock *newsk;
 321         struct tcphdr *th;
 322         struct rtable *rt;
 323   
 324         th = skb->h.th;
 325 
 326         /* If the socket is dead, don't accept the connection. */
 327         if (!sk->dead) 
 328         {
 329                 sk->data_ready(sk,0);
 330         }
 331         else 
 332         {
 333                 if(sk->debug)
 334                         printk("Reset on %p: Connect on dead socket.\n",sk);
 335                 tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
 336                 tcp_statistics.TcpAttemptFails++;
 337                 kfree_skb(skb, FREE_READ);
 338                 return;
 339         }
 340 
 341         /*
 342          *      Make sure we can accept more.  This will prevent a
 343          *      flurry of syns from eating up all our memory.
 344          *
 345          *      BSD does some funnies here and allows 3/2 times the
  346          *      set backlog as a fudge factor. That's just too gross.
 347          */
 348 
 349         if (sk->ack_backlog >= sk->max_ack_backlog) 
 350         {
 351                 tcp_statistics.TcpAttemptFails++;
 352                 kfree_skb(skb, FREE_READ);
 353                 return;
 354         }
 355 
 356         /*
 357          * We need to build a new sock struct.
 358          * It is sort of bad to have a socket without an inode attached
 359          * to it, but the wake_up's will just wake up the listening socket,
 360          * and if the listening socket is destroyed before this is taken
 361          * off of the queue, this will take care of it.
 362          */
 363 
 364         newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
 365         if (newsk == NULL) 
 366         {
 367                 /* just ignore the syn.  It will get retransmitted. */
 368                 tcp_statistics.TcpAttemptFails++;
 369                 kfree_skb(skb, FREE_READ);
 370                 return;
 371         }
 372 
 373         memcpy(newsk, sk, sizeof(*newsk));
 374         newsk->opt = NULL;
 375         newsk->ip_route_cache  = NULL;
 376         if (opt && opt->optlen) 
 377         {
  378                 newsk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
  379                 if (!newsk->opt) 
  380                 {
  381                         kfree_s(newsk, sizeof(struct sock));
  382                         tcp_statistics.TcpAttemptFails++;
  383                         kfree_skb(skb, FREE_READ);
  384                         return;
  385                 }
  386                 if (ip_options_echo(newsk->opt, opt, daddr, saddr, skb)) 
  387                 {
  388                         kfree_s(newsk->opt, sizeof(struct options)+opt->optlen);
 389                         kfree_s(newsk, sizeof(struct sock));
 390                         tcp_statistics.TcpAttemptFails++;
 391                         kfree_skb(skb, FREE_READ);
 392                         return;
 393                 }
 394         }
 395         skb_queue_head_init(&newsk->write_queue);
 396         skb_queue_head_init(&newsk->receive_queue);
 397         newsk->send_head = NULL;
 398         newsk->send_tail = NULL;
 399         skb_queue_head_init(&newsk->back_log);
 400         newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
 401         newsk->rto = TCP_TIMEOUT_INIT;
 402         newsk->mdev = 0;
 403         newsk->max_window = 0;
 404         newsk->cong_window = 1;
 405         newsk->cong_count = 0;
 406         newsk->ssthresh = 0;
 407         newsk->backoff = 0;
 408         newsk->blog = 0;
 409         newsk->intr = 0;
 410         newsk->proc = 0;
 411         newsk->done = 0;
 412         newsk->partial = NULL;
 413         newsk->pair = NULL;
 414         newsk->wmem_alloc = 0;
 415         newsk->rmem_alloc = 0;
 416         newsk->localroute = sk->localroute;
 417 
 418         newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
 419 
 420         newsk->err = 0;
 421         newsk->shutdown = 0;
 422         newsk->ack_backlog = 0;
 423         newsk->acked_seq = skb->seq+1;
 424         newsk->lastwin_seq = skb->seq+1;
 425         newsk->delay_acks = 1;
 426         newsk->copied_seq = skb->seq+1;
 427         newsk->fin_seq = skb->seq;
 428         newsk->state = TCP_SYN_RECV;
 429         newsk->timeout = 0;
 430         newsk->ip_xmit_timeout = 0;
 431         newsk->write_seq = seq; 
 432         newsk->window_seq = newsk->write_seq;
 433         newsk->rcv_ack_seq = newsk->write_seq;
 434         newsk->urg_data = 0;
 435         newsk->retransmits = 0;
 436         newsk->linger=0;
 437         newsk->destroy = 0;
 438         init_timer(&newsk->timer);
 439         newsk->timer.data = (unsigned long)newsk;
 440         newsk->timer.function = &net_timer;
 441         init_timer(&newsk->retransmit_timer);
 442         newsk->retransmit_timer.data = (unsigned long)newsk;
 443         newsk->retransmit_timer.function=&tcp_retransmit_timer;
 444         newsk->dummy_th.source = skb->h.th->dest;
 445         newsk->dummy_th.dest = skb->h.th->source;
 446         
 447         /*
 448          *      Swap these two, they are from our point of view. 
 449          */
 450          
 451         newsk->daddr = saddr;
 452         newsk->saddr = daddr;
 453         newsk->rcv_saddr = daddr;
 454 
 455         put_sock(newsk->num,newsk);
 456         newsk->acked_seq = skb->seq + 1;
 457         newsk->copied_seq = skb->seq + 1;
 458         newsk->socket = NULL;
 459 
 460         /*
 461          *      Grab the ttl and tos values and use them 
 462          */
 463 
 464         newsk->ip_ttl=sk->ip_ttl;
 465         newsk->ip_tos=skb->ip_hdr->tos;
 466 
 467         /*
  468          *      Use 576 or whatever the user asked for 
 469          */
 470 
 471         /*
 472          *      Note use of sk->user_mss, since user has no direct access to newsk 
 473          */
 474 
 475         rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
 476         newsk->ip_route_cache = rt;
 477         
 478         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
 479                 newsk->window_clamp = rt->rt_window;
 480         else
 481                 newsk->window_clamp = 0;
 482                 
 483         if (sk->user_mss)
 484                 newsk->mtu = sk->user_mss;
 485         else if (rt)
 486                 newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
 487         else 
 488                 newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
 489 
 490         /*
 491          *      But not bigger than device MTU 
 492          */
 493 
 494         newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
 495 
 496 #ifdef CONFIG_SKIP
 497         
 498         /*
 499          *      SKIP devices set their MTU to 65535. This is so they can take packets
  500          *      unfragmented to the security process and then fragment. They could lie
  501          *      to the TCP layer about a suitable MTU, but it's easier to let SKIP sort
  502          *      it out, simply because the final packet we want unfragmented is going to be
 503          *
 504          *      [IPHDR][IPSP][Security data][Modified TCP data][Security data]
 505          */
 506          
 507         if(skip_pick_mtu!=NULL)         /* If SKIP is loaded.. */
  508                 newsk->mtu=skip_pick_mtu(newsk->mtu,dev);
 509 #endif
 510         /*
 511          *      This will min with what arrived in the packet 
 512          */
 513 
 514         tcp_options(newsk,skb->h.th);
 515         
 516         tcp_cache_zap();
 517         tcp_send_synack(newsk, sk, skb);
 518 }
 519 
 520 
 521 /*
 522  * Handle a TCP window that shrunk on us. It shouldn't happen,
 523  * but..
 524  *
 525  * We may need to move packets from the send queue
 526  * to the write queue, if the window has been shrunk on us.
 527  * The RFC says you are not allowed to shrink your window
 528  * like this, but if the other end does, you must be able
 529  * to deal with it.
 530  */
 531 void tcp_window_shrunk(struct sock * sk, u32 window_seq)
 532 {
 533         struct sk_buff *skb;
 534         struct sk_buff *skb2;
 535         struct sk_buff *wskb = NULL;
 536         
 537         skb2 = sk->send_head;
 538         sk->send_head = NULL;
 539         sk->send_tail = NULL;
 540 
 541         /*
 542          *      This is an artifact of a flawed concept. We want one
 543          *      queue and a smarter send routine when we send all.
 544          */
 545         cli();
 546         while (skb2 != NULL) 
 547         {
 548                 skb = skb2;
 549                 skb2 = skb->link3;
 550                 skb->link3 = NULL;
 551                 if (after(skb->end_seq, window_seq)) 
 552                 {
 553                         if (sk->packets_out > 0) 
 554                                 sk->packets_out--;
 555                         /* We may need to remove this from the dev send list. */
 556                         if (skb->next != NULL) 
 557                         {
 558                                 skb_unlink(skb);                                
 559                         }
 560                         /* Now add it to the write_queue. */
 561                         if (wskb == NULL)
 562                                 skb_queue_head(&sk->write_queue,skb);
 563                         else
 564                                 skb_append(wskb,skb);
 565                         wskb = skb;
 566                 } 
 567                 else 
 568                 {
 569                         if (sk->send_head == NULL) 
 570                         {
 571                                 sk->send_head = skb;
 572                                 sk->send_tail = skb;
 573                         }
 574                         else
 575                         {
 576                                 sk->send_tail->link3 = skb;
 577                                 sk->send_tail = skb;
 578                         }
 579                         skb->link3 = NULL;
 580                 }
 581         }
 582         sti();
 583 }
 584 
 585 
 586 /*
 587  *      This routine deals with incoming acks, but not outgoing ones.
 588  *
 589  *      This routine is totally _WRONG_. The list structuring is wrong,
 590  *      the algorithm is wrong, the code is wrong.
 591  */
 592 
 593 static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
 594 {
 595         int flag = 0;
 596         u32 window_seq;
 597 
 598         /* 
 599          * 1 - there was data in packet as well as ack or new data is sent or 
 600          *     in shutdown state
 601          * 2 - data from retransmit queue was acked and removed
 602          * 4 - window shrunk or data from retransmit queue was acked and removed
 603          */
 604 
 605         if(sk->zapped)
  606                 return(1);      /* Dead, can't ack any more so why bother */
 607 
 608         /*
 609          *      We have dropped back to keepalive timeouts. Thus we have
 610          *      no retransmits pending.
 611          */
 612          
 613         if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
 614                 sk->retransmits = 0;
 615 
 616         /*
 617          *      If the ack is newer than sent or older than previous acks
 618          *      then we can probably ignore it.
 619          */
 620          
 621         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
 622                 goto uninteresting_ack;
 623 
 624         /*
 625          *      If there is data set flag 1
 626          */
 627          
 628         if (len != th->doff*4) 
 629                 flag |= 1;
 630 
 631         /*
 632          *      Have we discovered a larger window
 633          */
 634         window_seq = ntohs(th->window);
 635         if (window_seq > sk->max_window) 
 636         {
 637                 sk->max_window = window_seq;
 638 #ifdef CONFIG_INET_PCTCP
 639                 /* Hack because we don't send partial packets to non SWS
 640                    handling hosts */
 641                 sk->mss = min(window_seq>>1, sk->mtu);
 642 #else
 643                 sk->mss = min(window_seq, sk->mtu);
 644 #endif  
 645         }
 646         window_seq += ack;
 647 
 648         /*
 649          *      See if our window has been shrunk. 
 650          */
 651         if (after(sk->window_seq, window_seq)) {
 652                 flag |= 4;
 653                 tcp_window_shrunk(sk, window_seq);
 654         }
 655 
 656         /*
 657          *      Update the right hand window edge of the host
 658          */
 659         sk->window_seq = window_seq;
 660 
 661         /*
 662          *      Pipe has emptied
 663          */      
 664         if (sk->send_tail == NULL || sk->send_head == NULL) 
 665         {
 666                 sk->send_head = NULL;
 667                 sk->send_tail = NULL;
 668                 sk->packets_out= 0;
 669         }
 670 
 671         /*
 672          *      We don't want too many packets out there. 
 673          */
 674          
 675         if (sk->ip_xmit_timeout == TIME_WRITE && 
 676                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
 677         {
 678                 
 679                 /* 
 680                  * This is Jacobson's slow start and congestion avoidance. 
 681                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
 682                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
 683                  * counter and increment it once every cwnd times.  It's possible
 684                  * that this should be done only if sk->retransmits == 0.  I'm
 685                  * interpreting "new data is acked" as including data that has
 686                  * been retransmitted but is just now being acked.
 687                  */
 688                 if (sk->cong_window < sk->ssthresh)  
 689                         /* 
 690                          *      In "safe" area, increase
 691                          */
 692                         sk->cong_window++;
 693                 else 
 694                 {
 695                         /*
 696                          *      In dangerous area, increase slowly.  In theory this is
 697                          *      sk->cong_window += 1 / sk->cong_window
 698                          */
 699                         if (sk->cong_count >= sk->cong_window) 
 700                         {
 701                                 sk->cong_window++;
 702                                 sk->cong_count = 0;
 703                         }
 704                         else 
 705                                 sk->cong_count++;
 706                 }
 707         }
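
              /*
               *	Editor's worked example: with ssthresh 8, cong_window
               *	grows 1,2,3,...,8 one ack at a time (doubling each
               *	round trip), then needs cong_window acks per increment
               *	-- e.g. 8 more acks to go from 8 to 9 -- which is
               *	linear growth per round trip.
               */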
 708 
 709         /*
 710          *      Remember the highest ack received.
 711          */
 712          
 713         sk->rcv_ack_seq = ack;
 714         
 715         /*
 716          *      We passed data and got it acked, remove any soft error
 717          *      log. Something worked...
 718          */
 719          
 720         sk->err_soft = 0;
 721 
 722         /*
 723          *      If this ack opens up a zero window, clear backoff.  It was
 724          *      being used to time the probes, and is probably far higher than
 725          *      it needs to be for normal retransmission.
 726          */
 727 
 728         if (sk->ip_xmit_timeout == TIME_PROBE0) 
 729         {
 730                 sk->retransmits = 0;    /* Our probe was answered */
 731                 
 732                 /*
 733                  *      Was it a usable window open ?
 734                  */
 735                  
 736                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
 737                     ! before (sk->window_seq, sk->write_queue.next->end_seq)) 
 738                 {
 739                         sk->backoff = 0;
 740                         
 741                         /*
 742                          *      Recompute rto from rtt.  this eliminates any backoff.
 743                          */
 744 
 745                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
 746                         if (sk->rto > 120*HZ)
 747                                 sk->rto = 120*HZ;
 748                         if (sk->rto < HZ/5)     /* Was 1*HZ, then 1 - turns out we must allow about
 749                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
 750                                                    .2 of a second is going to need huge windows (SIGH) */
 751                         sk->rto = HZ/5;
 752                 }
 753         }
 754 
 755         /* 
 756          *      See if we can take anything off of the retransmit queue.
 757          */
 758    
 759         while(sk->send_head != NULL) 
 760         {
 761                 /* Check for a bug. */
 762                 if (sk->send_head->link3 &&
 763                     after(sk->send_head->end_seq, sk->send_head->link3->end_seq)) 
 764                         printk("INET: tcp.c: *** bug send_list out of order.\n");
 765                         
 766                 /*
 767                  *      If our packet is before the ack sequence we can
  768          *      discard it as it's confirmed to have arrived at the other end.
 769                  */
 770                  
 771                 if (before(sk->send_head->end_seq, ack+1)) 
 772                 {
 773                         struct sk_buff *oskb;   
 774                         if (sk->retransmits) 
 775                         {       
 776                                 /*
 777                                  *      We were retransmitting.  don't count this in RTT est 
 778                                  */
 779                                 flag |= 2;
 780 
 781                                 /*
 782                                  * even though we've gotten an ack, we're still
 783                                  * retransmitting as long as we're sending from
 784                                  * the retransmit queue.  Keeping retransmits non-zero
 785                                  * prevents us from getting new data interspersed with
 786                                  * retransmissions.
 787                                  */
 788 
 789                                 if (sk->send_head->link3)       /* Any more queued retransmits? */
 790                                         sk->retransmits = 1;
 791                                 else
 792                                         sk->retransmits = 0;
 793                         }
 794                         /*
 795                          * Note that we only reset backoff and rto in the
 796                          * rtt recomputation code.  And that doesn't happen
 797                          * if there were retransmissions in effect.  So the
 798                          * first new packet after the retransmissions is
 799                          * sent with the backoff still in effect.  Not until
 800                          * we get an ack from a non-retransmitted packet do
 801                          * we reset the backoff and rto.  This allows us to deal
 802                          * with a situation where the network delay has increased
 803                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
 804                          */
 805 
 806                         /*
 807                          *      We have one less packet out there. 
 808                          */
 809                          
 810                         if (sk->packets_out > 0) 
 811                                 sk->packets_out --;
 812 
 813                         oskb = sk->send_head;
 814 
 815                         if (!(flag&2))  /* Not retransmitting */
 816                                 tcp_rtt_estimator(sk,oskb);
  817                         flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt
  818                                            in this case' as we just set it up */
 819                         cli();
 820                         oskb = sk->send_head;
 821                         IS_SKB(oskb);
 822                         sk->send_head = oskb->link3;
 823                         if (sk->send_head == NULL) 
 824                         {
 825                                 sk->send_tail = NULL;
 826                         }
 827 
 828                 /*
 829                  *      We may need to remove this from the dev send list. 
 830                  */
 831 
 832                         if (oskb->next)
 833                                 skb_unlink(oskb);
 834                         sti();
 835                         kfree_skb(oskb, FREE_WRITE); /* write. */
 836                         if (!sk->dead)
 837                                 sk->write_space(sk);
 838                 }
 839                 else
 840                 {
 841                         break;
 842                 }
 843         }
 844 
 845         /*
 846          * XXX someone ought to look at this too.. at the moment, if skb_peek()
  847          * returns non-NULL, we completely ignore the timer stuff in the else
 848          * clause.  We ought to organize the code so that else clause can
 849          * (should) be executed regardless, possibly moving the PROBE timer
 850          * reset over.  The skb_peek() thing should only move stuff to the
 851          * write queue, NOT also manage the timer functions.
 852          */
 853 
 854         /*
 855          * Maybe we can take some stuff off of the write queue,
 856          * and put it onto the xmit queue.
 857          */
 858         if (skb_peek(&sk->write_queue) != NULL) 
 859         {
 860                 if (!before(sk->window_seq, sk->write_queue.next->end_seq) &&
 861                         (sk->retransmits == 0 || 
 862                          sk->ip_xmit_timeout != TIME_WRITE ||
 863                          !after(sk->write_queue.next->end_seq, sk->rcv_ack_seq))
 864                         && sk->packets_out < sk->cong_window) 
 865                 {
 866                         /*
 867                          *      Add more data to the send queue.
 868                          */
 869                         flag |= 1;
 870                         tcp_write_xmit(sk);
 871                 }
 872                 else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
 873                         sk->send_head == NULL &&
 874                         sk->ack_backlog == 0 &&
 875                         sk->state != TCP_TIME_WAIT) 
 876                 {
 877                         /*
 878                          *      Data to queue but no room.
 879                          */
 880                         tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
 881                 }               
 882         }
 883         else
 884         {
 885                 /*
 886                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
 887                  * from TCP_CLOSE we don't do anything
 888                  *
 889                  * from anything else, if there is write data (or fin) pending,
 890                  * we use a TIME_WRITE timeout, else if keepalive we reset to
 891                  * a KEEPALIVE timeout, else we delete the timer.
 892                  *
 893                  * We do not set flag for nominal write data, otherwise we may
 894                  * force a state where we start to write itsy bitsy tidbits
 895                  * of data.
 896                  */
 897 
 898                 switch(sk->state) {
 899                 case TCP_TIME_WAIT:
 900                         /*
 901                          * keep us in TIME_WAIT until we stop getting packets,
 902                          * reset the timeout.
 903                          */
 904                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 905                         break;
 906                 case TCP_CLOSE:
 907                         /*
 908                          * don't touch the timer.
 909                          */
 910                         break;
 911                 default:
 912                         /*
 913                          *      Must check send_head, write_queue, and ack_backlog
 914                          *      to determine which timeout to use.
 915                          */
 916                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
 917                                 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
 918                         } else if (sk->keepopen) {
 919                                 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
 920                         } else {
 921                                 del_timer(&sk->retransmit_timer);
 922                                 sk->ip_xmit_timeout = 0;
 923                         }
 924                         break;
 925                 }
 926         }
 927 
 928         /*
 929          *      We have nothing queued but space to send. Send any partial
 930          *      packets immediately (end of Nagle rule application).
 931          */
 932          
 933         if (sk->packets_out == 0 && sk->partial != NULL &&
 934                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
 935         {
 936                 flag |= 1;
 937                 tcp_send_partial(sk);
 938         }
 939 
 940         /*
 941          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
 942          * we are now waiting for an acknowledge to our FIN.  The other end is
 943          * already in TIME_WAIT.
 944          *
 945          * Move to TCP_CLOSE on success.
 946          */
 947 
 948         if (sk->state == TCP_LAST_ACK) 
 949         {
 950                 if (!sk->dead)
 951                         sk->state_change(sk);
 952                 if(sk->debug)
 953                         printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
 954                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
 955                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
 956                 {
 957                         flag |= 1;
 958                         sk->shutdown = SHUTDOWN_MASK;
 959                         tcp_set_state(sk,TCP_CLOSE);
 960                         return 1;
 961                 }
 962         }
 963 
 964         /*
 965          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
 966          *
 967          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
 968          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
 969          */
 970 
 971         if (sk->state == TCP_FIN_WAIT1) 
 972         {
 973 
 974                 if (!sk->dead) 
 975                         sk->state_change(sk);
 976                 if (sk->rcv_ack_seq == sk->write_seq) 
 977                 {
 978                         flag |= 1;
 979                         sk->shutdown |= SEND_SHUTDOWN;
 980                         tcp_set_state(sk, TCP_FIN_WAIT2);
 981                 }
 982         }
 983 
 984         /*
 985          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
 986          *
 987          *      Move to TIME_WAIT
 988          */
 989 
 990         if (sk->state == TCP_CLOSING) 
 991         {
 992 
 993                 if (!sk->dead) 
 994                         sk->state_change(sk);
 995                 if (sk->rcv_ack_seq == sk->write_seq) 
 996                 {
 997                         flag |= 1;
 998                         tcp_time_wait(sk);
 999                 }
1000         }
1001         
1002         /*
 1003          *      Final ack of a three way handshake 
1004          */
1005          
1006         if(sk->state==TCP_SYN_RECV)
1007         {
1008                 tcp_set_state(sk, TCP_ESTABLISHED);
1009                 tcp_options(sk,th);
1010                 sk->dummy_th.dest=th->source;
1011                 sk->copied_seq = sk->acked_seq;
1012                 if(!sk->dead)
1013                         sk->state_change(sk);
1014                 if(sk->max_window==0)
1015                 {
1016                         sk->max_window=32;      /* Sanity check */
1017                         sk->mss=min(sk->max_window,sk->mtu);
1018                 }
1019         }
1020         
1021         /*
1022          * I make no guarantees about the first clause in the following
1023          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
1024          * what conditions "!flag" would be true.  However I think the rest
1025          * of the conditions would prevent that from causing any
1026          * unnecessary retransmission. 
1027          *   Clearly if the first packet has expired it should be 
1028          * retransmitted.  The other alternative, "flag&2 && retransmits", is
1029          * harder to explain:  You have to look carefully at how and when the
1030          * timer is set and with what timeout.  The most recent transmission always
1031          * sets the timer.  So in general if the most recent thing has timed
1032          * out, everything before it has as well.  So we want to go ahead and
1033          * retransmit some more.  If we didn't explicitly test for this
1034          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
1035          * would not be true.  If you look at the pattern of timing, you can
1036          * show that rto is increased fast enough that the next packet would
1037          * almost never be retransmitted immediately.  Then you'd end up
1038          * waiting for a timeout to send each packet on the retransmission
1039          * queue.  With my implementation of the Karn sampling algorithm,
1040          * the timeout would double each time.  The net result is that it would
1041          * take a hideous amount of time to recover from a single dropped packet.
1042          * It's possible that there should also be a test for TIME_WRITE, but
1043          * I think as long as "send_head != NULL" and "retransmit" is on, we've
1044          * got to be in real retransmission mode.
1045          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
1046          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
1047          * As long as no further losses occur, this seems reasonable.
1048          */
1049         
1050         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
1051                (((flag&2) && sk->retransmits) ||
1052                (sk->send_head->when + sk->rto < jiffies))) 
1053         {
1054                 if(sk->send_head->when + sk->rto < jiffies)
1055                         tcp_retransmit(sk,0);   
1056                 else
1057                 {
1058                         tcp_do_retransmit(sk, 1);
1059                         tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1060                 }
1061         }
1062 
1063         return 1;
1064 
1065 uninteresting_ack:
1066         if(sk->debug)
1067                 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
1068                         
1069         /*
1070          *      Keepalive processing.
1071          */
1072                  
1073         if (after(ack, sk->sent_seq)) 
1074         {
1075                 return 0;
1076         }
1077                 
1078         /*
1079          *      Restart the keepalive timer.
1080          */
1081                  
1082         if (sk->keepopen) 
1083         {
1084                 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
1085                         tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1086         }
1087         return 1;
1088 }
1089 
1090 
1091 /*
 1092  *      Process the FIN bit. This now behaves as it is supposed to:
 1093  *      the FIN takes effect only when it is validly part of the sequence
 1094  *      space, not earlier, while we still have holes.
1095  *
1096  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1097  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
1098  *      TIME-WAIT)
1099  *
1100  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
1101  *      close and we go into CLOSING (and later onto TIME-WAIT)
1102  *
1103  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1104  *
1105  */
1106  
1107 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
1108 {
1109         sk->fin_seq = skb->end_seq;
1110 
1111         if (!sk->dead) 
1112         {
1113                 sk->state_change(sk);
1114                 sock_wake_async(sk->socket, 1);
1115         }
1116 
1117         switch(sk->state) 
1118         {
1119                 case TCP_SYN_RECV:
1120                 case TCP_SYN_SENT:
1121                 case TCP_ESTABLISHED:
1122                         /*
1123                          * move to CLOSE_WAIT, tcp_data() already handled
1124                          * sending the ack.
1125                          */
1126                         tcp_set_state(sk,TCP_CLOSE_WAIT);
1127                         if (th->rst)
1128                                 sk->shutdown = SHUTDOWN_MASK;
1129                         break;
1130 
1131                 case TCP_CLOSE_WAIT:
1132                 case TCP_CLOSING:
1133                         /*
1134                          * received a retransmission of the FIN, do
1135                          * nothing.
1136                          */
1137                         break;
1138                 case TCP_TIME_WAIT:
1139                         /*
1140                          * received a retransmission of the FIN,
1141                          * restart the TIME_WAIT timer.
1142                          */
1143                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1144                         return(0);
1145                 case TCP_FIN_WAIT1:
1146                         /*
1147                          * This case occurs when a simultaneous close
1148                          * happens, we must ack the received FIN and
1149                          * enter the CLOSING state.
1150                          *
1151                          * This causes a WRITE timeout, which will either
1152                          * move on to TIME_WAIT when we timeout, or resend
1153                          * the FIN properly (maybe we get rid of that annoying
1154                          * FIN lost hang). The TIME_WRITE code is already correct
1155                          * for handling this timeout.
1156                          */
1157 
1158                         if(sk->ip_xmit_timeout != TIME_WRITE)
1159                                 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1160                         tcp_set_state(sk,TCP_CLOSING);
1161                         break;
1162                 case TCP_FIN_WAIT2:
1163                         /*
1164                          * received a FIN -- send ACK and enter TIME_WAIT
1165                          */
1166                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1167                         sk->shutdown|=SHUTDOWN_MASK;
1168                         tcp_set_state(sk,TCP_TIME_WAIT);
1169                         break;
1170                 case TCP_CLOSE:
1171                         /*
1172                          * already in CLOSE
1173                          */
1174                         break;
1175                 default:
1176                         tcp_set_state(sk,TCP_LAST_ACK);
1177         
1178                         /* Start the timers. */
1179                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1180                         return(0);
1181         }
1182 
1183         return(0);
1184 }
1185 
1186 /*
1187  * Add a sk_buff to the TCP receive queue, calculating
1188  * the ACK sequence as we go..
1189  */
1190 static inline void tcp_insert_skb(struct sk_buff * skb, struct sk_buff_head * list)
1191 {
1192         struct sk_buff * prev, * next;
1193         u32 seq;
1194 
1195         /*
1196          * Find where the new skb goes.. (This goes backwards,
1197          * on the assumption that we get the packets in order)
1198          */
1199         seq = skb->seq;
1200         prev = list->prev;
1201         next = (struct sk_buff *) list;
1202         for (;;) {
1203                 if (prev == (struct sk_buff *) list || !after(prev->seq, seq))
1204                         break;
1205                 next = prev;
1206                 prev = prev->prev;
1207         }
1208         __skb_insert(skb, prev, next, list);
1209 }
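
      /*
       *	Editor's example: segments arriving with seq 100, 300, 200
       *	end up queued as 100, 200, 300. The backward scan above is
       *	O(1) for the common in-order case, since a new skb usually
       *	belongs at the tail; only out-of-order segments walk the list.
       */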
1210 
1211 /*
1212  * Called for each packet when we find a new ACK endpoint sequence in it
1213  */
1214 static inline u32 tcp_queue_ack(struct sk_buff * skb, struct sock * sk)
1215 {
1216         /*
1217          *      When we ack the fin, we do the FIN 
1218          *      processing.
1219          */
1220         skb->acked = 1;
1221         if (skb->h.th->fin)
1222                 tcp_fin(skb,sk,skb->h.th);
1223         return skb->end_seq;
1224 }       
1225 
1226 static void tcp_queue(struct sk_buff * skb, struct sock * sk,
1227         struct tcphdr *th, unsigned long saddr)
1228 {
1229         u32 ack_seq;
1230 
1231         tcp_insert_skb(skb, &sk->receive_queue);
1232         /*
1233          * Did we get anything new to ack?
1234          */
1235         ack_seq = sk->acked_seq;
1236         if (!after(skb->seq, ack_seq) && after(skb->end_seq, ack_seq)) {
1237                 struct sk_buff_head * list = &sk->receive_queue;
1238                 struct sk_buff * next;
1239                 ack_seq = tcp_queue_ack(skb, sk);
1240 
1241                 /*
1242                  * Do we have any old packets to ack that the above
1243                  * made visible? (Go forward from skb)
1244                  */
1245                 next = skb->next;
1246                 while (next != (struct sk_buff *) list) {
1247                         if (after(next->seq, ack_seq))
1248                                 break;
1249                         if (after(next->end_seq, ack_seq))
1250                                 ack_seq = tcp_queue_ack(next, sk);
1251                         next = next->next;
1252                 }
1253 
1254                 /*
1255                  * Ok, we found new data, update acked_seq as
1256                  * necessary (and possibly send the actual
1257                  * ACK packet).
1258                  */
1259                 sk->acked_seq = ack_seq;
1260 
1261                 /*
1262                  *      rules for delaying an ack:
1263                  *      - delay time <= 0.5 HZ
1264                  *      - must send at least every 2 full sized packets
1265                  *      - we don't have a window update to send
1266                  *
1267                  * We handle the window update in the actual read
1268                  * side, so we only have to worry about the first two.
1269                  */
1270                 if (!sk->delay_acks || th->fin) {
1271                         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
1272                 }
1273                 else
1274                 {
1275                         int timeout = sk->ato;
1276                         if (timeout > HZ/2)
1277                                 timeout = HZ/2;
1278                         if (sk->bytes_rcv > sk->max_unacked) {
1279                                 timeout = 0;
1280                                 mark_bh(TIMER_BH);
1281                         }
1282                         sk->ack_backlog++;
1283                         if(sk->debug)
1284                                 printk("Ack queued.\n");
1285                         tcp_reset_xmit_timer(sk, TIME_WRITE, timeout);
1286                 }               
1287         }
1288 }
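
      /*
       *	Editor's sketch (not part of the original file) of the
       *	delayed-ack timeout policy applied above, with a hypothetical
       *	name:
       */
      #if 0
      static int demo_delack_timeout(struct sock *sk)
      {
              int timeout = sk->ato;
              if (timeout > HZ/2)
                      timeout = HZ/2;         /* delay at most 0.5 seconds */
              if (sk->bytes_rcv > sk->max_unacked)
                      timeout = 0;            /* two full frames pending: ack now */
              return timeout;
      }
      #endif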
1289 
1290 
1291 /*
1292  *      This routine handles the data.  If there is room in the buffer,
 1293  *      it will already have been moved into it.  If there is no
1294  *      room, then we will just have to discard the packet.
1295  */
1296 
1297 static int tcp_data(struct sk_buff *skb, struct sock *sk, 
1298          unsigned long saddr, unsigned short len)
1299 {
1300         struct tcphdr *th;
1301         u32 new_seq, shut_seq;
1302 
1303         th = skb->h.th;
1304         skb_pull(skb,th->doff*4);
1305         skb_trim(skb,len-(th->doff*4));
1306 
1307         /*
 1308          *      The bytes in the receive read/assembly queue have increased. Needed for the
1309          *      low memory discard algorithm 
1310          */
1311            
1312         sk->bytes_rcv += skb->len;
1313         
1314         if (skb->len == 0 && !th->fin) 
1315         {
1316                 /* 
1317                  *      Don't want to keep passing ack's back and forth. 
 1318                  *      (someone sent us a dataless, boring frame)
1319                  */
1320                 if (!th->ack)
1321                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
1322                 kfree_skb(skb, FREE_READ);
1323                 return(0);
1324         }
1325         
1326         /*
1327          *      We no longer have anyone receiving data on this connection.
1328          */
1329 
1330 #ifndef TCP_DONT_RST_SHUTDOWN            
1331 
1332         if(sk->shutdown & RCV_SHUTDOWN)
1333         {
1334                 /*
1335                  *      FIXME: BSD has some magic to avoid sending resets to
1336                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
1337                  *      BSD stacks still have broken keepalives so we want to
1338                  *      cope with it.
1339                  */
1340 
1341                 if(skb->len)    /* We don't care if it's just an ack or
1342                                    a keepalive/window probe */
1343                 {
1344                         new_seq = skb->seq + skb->len + th->syn;        /* Right edge of _data_ part of frame */
1345                         
1346                         /* Do this the way 4.4BSD treats it. Not what I'd
1347                            regard as the meaning of the spec but it's what BSD
1348                            does and clearly they know everything 8) */
1349 
1350                         /*
1351                          *      This is valid because of two things
1352                          *
1353                          *      a) The way tcp_data behaves at the bottom.
1354                  *      b) A FIN takes effect when read, not when received.
1355                          */
1356                          
1357                         shut_seq = sk->acked_seq+1;     /* Last byte */
1358                         
1359                         if(after(new_seq,shut_seq))
1360                         {
1361                                 if(sk->debug)
1362                                         printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
1363                                                 sk, new_seq, shut_seq, sk->blog);
1364                                 if(sk->dead)
1365                                 {
1366                                         sk->acked_seq = new_seq + th->fin;
1367                                         tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
1368                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
1369                                         tcp_statistics.TcpEstabResets++;
1370                                         sk->err = EPIPE;
1371                                         sk->error_report(sk);
1372                                         sk->shutdown = SHUTDOWN_MASK;
1373                                         tcp_set_state(sk,TCP_CLOSE);
1374                                         kfree_skb(skb, FREE_READ);
1375                                         return 0;
1376                                 }
1377                         }
1378                 }
1379         }
1380 
1381 #endif
1382 
1383         tcp_queue(skb, sk, th, saddr);
1384 
1385         /*
1386          *      If we've missed a packet, send an ack.
1387          *      Also start a timer to send another.
1388          */
1389          
1390         if (!skb->acked) 
1391         {
1392                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
1393                 sk->ack_backlog++;
1394                 tcp_reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, HZ/2));
1395         }
1396 
1397         /*
1398          *      Now tell the user we may have some data. 
1399          */
1400          
1401         if (!sk->dead) 
1402         {
1403                 if(sk->debug)
1404                         printk("Data wakeup.\n");
1405                 sk->data_ready(sk,0);
1406         } 
1407         return(0);
1408 }
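
/*
 * Sketch (not in the source): the "data arrived after shutdown" test
 * above, with made-up sequence numbers.  after() is written here in the
 * style of the net/tcp.h helpers; the values are illustrative only.
 */
#include <stdio.h>

typedef unsigned int u32;

#define after(seq1, seq2) ((int)((seq2) - (seq1)) < 0)

int main(void)
{
        u32 acked_seq = 5000;            /* receiver acked up to here */
        u32 seq = 4000, len = 1500, syn = 0;

        u32 new_seq  = seq + len + syn;  /* right edge of data: 5500  */
        u32 shut_seq = acked_seq + 1;

        if (after(new_seq, shut_seq))
                printf("fresh data past the close point: reset\n");
        else
                printf("stale or duplicate data: ignore\n");
        return 0;
}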
1409 
1410 
1411 /*
1412  *      This routine is only called when we have urgent data
1413  *      signalled. It's the 'slow' part of tcp_urg. It could be
1414  *      moved inline now as tcp_urg is only called from one
1415  *      place. We handle URGent data wrong. We have to - as
1416  *      BSD still doesn't use the correction from RFC961.
1417  */
1418  
1419 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
1420 {
1421         u32 ptr = ntohs(th->urg_ptr);
1422 
1423         if (ptr)
1424                 ptr--;
1425         ptr += ntohl(th->seq);
1426 
1427         /* ignore urgent data that we've already seen and read */
1428         if (after(sk->copied_seq, ptr))
1429                 return;
1430 
1431         /* do we already have a newer (or duplicate) urgent pointer? */
1432         if (sk->urg_data && !after(ptr, sk->urg_seq))
1433                 return;
1434 
1435         /* tell the world about our new urgent pointer */
1436         if (sk->proc != 0) {
1437                 if (sk->proc > 0) {
1438                         kill_proc(sk->proc, SIGURG, 1);
1439                 } else {
1440                         kill_pg(-sk->proc, SIGURG, 1);
1441                 }
1442         }
1443         sk->urg_data = URG_NOTYET;
1444         sk->urg_seq = ptr;
1445 }
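
/*
 * Worked example (not in the source): tcp_check_urg() treats th->urg_ptr
 * the way BSD does, as pointing one past the urgent byte, so it backs up
 * by one before adding the segment sequence number.  Numbers below are
 * made up.
 */
#include <stdio.h>

typedef unsigned int u32;

static u32 urgent_seq(u32 seq, u32 urg_ptr)
{
        u32 ptr = urg_ptr;
        if (ptr)                /* urg_ptr == 0 is left alone */
                ptr--;
        return seq + ptr;
}

int main(void)
{
        /* seq 1000, urg_ptr 4: the urgent byte itself sits at
           sequence number 1003, not 1004 */
        printf("%u\n", urgent_seq(1000, 4));
        return 0;
}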
1446 
1447 /*
1448  *      This is the 'fast' part of urgent handling.
1449  */
1450  
1451 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
1452 {
1453         /*
1454          *      Check if we get a new urgent pointer - normally not 
1455          */
1456          
1457         if (th->urg)
1458                 tcp_check_urg(sk,th);
1459 
1460         /*
1461          *      Do we wait for any urgent data? - normally not
1462          */
1463          
1464         if (sk->urg_data == URG_NOTYET) {
1465                 u32 ptr;
1466 
1467                 /*
1468                  *      Is the urgent pointer pointing into this packet? 
1469                  */      
1470                 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
1471                 if (ptr < len) {
1472                         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
1473                         if (!sk->dead)
1474                                 sk->data_ready(sk,0);
1475                 }
1476         }
1477 }
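
/*
 * Worked example (not in the source): the offset computation in
 * tcp_urg() above.  urg_seq - seq gives the position of the urgent byte
 * relative to the segment's data, and adding doff*4 rebases it onto the
 * frame as held in the skb (header included).  Values are made up.
 */
#include <stdio.h>

typedef unsigned int u32;

int main(void)
{
        u32 urg_seq = 1003;      /* computed by tcp_check_urg()     */
        u32 seq     = 1000;      /* ntohl(th->seq) of this segment  */
        u32 doff    = 5;         /* 20-byte header, in 32-bit words */
        u32 len     = 45;        /* header + payload bytes          */

        u32 ptr = urg_seq - seq + doff*4;   /* 1003-1000+20 = 23 */

        if (ptr < len)
                printf("urgent byte at frame offset %u\n", ptr);
        else
                printf("urgent byte is in a later segment\n");
        return 0;
}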
1478 
1479 /*
1480  * This should be a bit smarter and remove partially
1481  * overlapping stuff too, but this should be good
1482  * enough for any even remotely normal case (and the
1483  * worst that can happen is that we have a few
1484  * unnecessary packets in the receive queue).
1485  *
1486  * This function is never called with an empty list..
1487  */
1488 static inline void tcp_remove_dups(struct sk_buff_head * list)
1489 {
1490         struct sk_buff * next = list->next;
1491 
1492         for (;;) {
1493                 struct sk_buff * skb = next;
1494                 next = next->next;
1495                 if (next == (struct sk_buff *) list)
1496                         break;
1497                 if (before(next->end_seq, skb->end_seq)) {
1498                         __skb_unlink(next, list);
1499                         kfree_skb(next, FREE_READ);
1500                         next = skb;
1501                         continue;
1502                 }
1503                 if (next->seq != skb->seq)
1504                         continue;
1505                 __skb_unlink(skb, list);
1506                 kfree_skb(skb, FREE_READ);
1507         }
1508 }
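
/*
 * Sketch (not in the source): the before()/after() comparisons used by
 * tcp_remove_dups() and friends are modulo-2^32 sequence arithmetic -
 * subtract and test the sign - written here in the style of the
 * net/tcp.h macros.
 */
#include <stdio.h>

typedef unsigned int u32;

#define before(seq1, seq2) ((int)((seq1) - (seq2)) < 0)
#define after(seq1, seq2)  before(seq2, seq1)

int main(void)
{
        /* 0x00000010 is "after" 0xfffffff0 even though it is numerically
           smaller: the 32-bit sequence space wrapped in between */
        printf("%d\n", after(0x00000010u, 0xfffffff0u));   /* prints 1 */
        printf("%d\n", before(0xfffffff0u, 0x00000010u));  /* prints 1 */
        return 0;
}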
1509 
1510 /*
1511  * Throw out all unnecessary packets: we've gone over the
1512  * receive queue limit. This shouldn't happen in a normal
1513  * TCP connection, but we might have gotten duplicates etc.
1514  */
1515 static void prune_queue(struct sk_buff_head * list)
1516 {
1517         for (;;) {
1518                 struct sk_buff * skb = list->prev;
1519 
1520                 /* gone through it all? */
1521                 if (skb == (struct sk_buff *) list)
1522                         break;
1523                 if (!skb->acked) {
1524                         __skb_unlink(skb, list);
1525                         kfree_skb(skb, FREE_READ);
1526                         continue;
1527                 }
1528                 tcp_remove_dups(list);
1529                 break;
1530         }
1531 }
1532 
1533 /*
1534  *      A TCP packet has arrived.
1535  *              skb->h.raw is the TCP header.
1536  */
1537  
1538 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
1539         __u32 daddr, unsigned short len,
1540         __u32 saddr, int redo, struct inet_protocol * protocol)
1541 {
1542         struct tcphdr *th;
1543         struct sock *sk;
1544         int syn_ok=0;
1545 
1546         /*
1547          * "redo" is 1 if we have already seen this skb but couldn't
1548          * use it at that time (the socket was locked).  In that case
1549          * we have already done a lot of the work (looked up the socket
1550          * etc).
1551          */
1552         th = skb->h.th;
1553         sk = skb->sk;
1554         if (!redo) {
1555                 tcp_statistics.TcpInSegs++;
1556                 if (skb->pkt_type!=PACKET_HOST)
1557                         goto discard_it;
1558 
1559                 /*
1560                  *      Pull up the IP header.
1561                  */
1562         
1563                 skb_pull(skb, skb->h.raw-skb->data);
1564 
1565                 /*
1566                  *      Try to use the device checksum if provided.
1567                  */
1568                 switch (skb->ip_summed) 
1569                 {
1570                         case CHECKSUM_NONE:
1571                                 skb->csum = csum_partial((char *)th, len, 0);
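                                /* fall through: the sum we just computed
                                   is verified below like a hardware one */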
1572                         case CHECKSUM_HW:
1573                                 if (tcp_check(th, len, saddr, daddr, skb->csum))
1574                                         goto discard_it;
1575                         default:
1576                                 break;  /* CHECKSUM_UNNECESSARY */
1577                 }
1578                 sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
1579                 if (!sk)
1580                         goto no_tcp_socket;
1581                 skb->sk = sk;
1582                 skb->seq = ntohl(th->seq);
1583                 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
1584                 skb->ack_seq = ntohl(th->ack_seq);
1585 
1586                 skb->acked = 0;
1587                 skb->used = 0;
1588                 skb->free = 1;
1589                 skb->saddr = daddr;
1590                 skb->daddr = saddr;
1591 
1592                 /*
1593                  * We may need to add it to the backlog here. 
1594                  */
1595                 if (sk->users) 
1596                 {
1597                         __skb_queue_tail(&sk->back_log, skb);
1598                         return(0);
1599                 }
1600         }
1601 
1602         /*
1603          *      If this socket has got a reset it's to all intents and purposes 
1604          *      really dead. Count closed sockets as dead.
1605          *
1606          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
1607          *      simply drops data. This seems incorrect: a 'closed' TCP doesn't
1608          *      exist, so it should cause resets as if the port were unreachable.
1609          */
1610 
1611         if (sk->zapped || sk->state==TCP_CLOSE)
1612                 goto no_tcp_socket;
1613 
1614         if (!sk->prot) 
1615         {
1616                 printk("IMPOSSIBLE 3\n");
1617                 return(0);
1618         }
1619 
1620 
1621         /*
1622          *      Charge the memory to the socket. 
1623          */
1624          
1625         skb->sk=sk;
1626         atomic_add(skb->truesize, &sk->rmem_alloc);
1627         
1628         /*
1629          *      We should now do header prediction.
1630          */
1631          
1632         /*
1633          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
1634          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
1635          *      compatibility. We also set up variables more thoroughly [Karn notes in the
1636          *      KA9Q code that the RFC793 incoming segment rules don't initialise the variables for all paths].
1637          */
1638 
1639         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
1640         {
1641         
1642                 /*
1643                  *      Now deal with unusual cases.
1644                  */
1645          
1646                 if(sk->state==TCP_LISTEN)
1647                 {
1648                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
1649                                 tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
1650 
1651                         /*
1652                          *      We don't care for RST, and non-SYN segments are absorbed (old segments).
1653                          *      Broadcast/multicast SYN isn't allowed. Note - bug: if you change the
1654                          *      netmask on a running connection it can go broadcast. Even Suns have
1655                          *      this problem, so I'm ignoring it.
1656                          */
1657                            
1658                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
1659                         {
1660                                 kfree_skb(skb, FREE_READ);
1661                                 return 0;
1662                         }
1663                 
1664                         /*      
1665                          *      Guess we need to make a new socket up 
1666                          */
1667                 
1668                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
1669                 
1670                         /*
1671                          *      Now we have several options: In theory there is nothing else
1672                          *      in the frame. KA9Q has an option to send data with the syn,
1673                          *      BSD accepts data with the syn up to the [to be] advertised window
1674                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
1675                          *      it; that fits the spec precisely and avoids incompatibilities. It
1676                          *      would be nice in future to drop through and process the data.
1677                          *
1678                          *      Now that T/TCP is starting to see use, we ought to queue this data.
1679                          */
1680                          
1681                         return 0;
1682                 }
1683         
1684                 /* 
1685                  *      Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
1686                  *      then it's a new connection.
1687                  */
1688                  
1689                 if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
1690                 {
1691                         kfree_skb(skb, FREE_READ);
1692                         return 0;
1693                 }
1694                 
1695                 /*
1696                  *      SYN sent means we have to look for a suitable ack and either reset
1697                  *      for bad matches or go to connected. The SYN_SENT case is unusual and should
1698                  *      not be in-line code. [AC]
1699                  */
1700            
1701                 if(sk->state==TCP_SYN_SENT)
1702                 {
1703                         /* Crossed SYN or previous junk segment */
1704                         if(th->ack)
1705                         {
1706                                 /* We got an ack, but it's not a good ack */
1707                                 if(!tcp_ack(sk,th,skb->ack_seq,len))
1708                                 {
1709                                         /* Reset the ack - it's an ack from a
1710                                            different connection  [ th->rst is checked in tcp_send_reset()] */
1711                                         tcp_statistics.TcpAttemptFails++;
1712                                         tcp_send_reset(daddr, saddr, th,
1713                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1714                                         kfree_skb(skb, FREE_READ);
1715                                         return(0);
1716                                 }
1717                                 if(th->rst)
1718                                         return tcp_reset(sk,skb);
1719                                 if(!th->syn)
1720                                 {
1721                                         /* A valid ack from a different connection
1722                                            attempt. Shouldn't happen, but cover it */
1723                                         tcp_statistics.TcpAttemptFails++;
1724                                         tcp_send_reset(daddr, saddr, th,
1725                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1726                                         kfree_skb(skb, FREE_READ);
1727                                         return 0;
1728                                 }
1729                                 /*
1730                                  *      Ok.. it's good. Set up sequence numbers and
1731                                  *      move to established.
1732                                  */
1733                                 syn_ok=1;       /* Don't reset this connection for the syn */
1734                                 sk->acked_seq = skb->seq+1;
1735                                 sk->lastwin_seq = skb->seq+1;
1736                                 sk->fin_seq = skb->seq;
1737                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
1738                                 tcp_set_state(sk, TCP_ESTABLISHED);
1739                                 tcp_options(sk,th);
1740                                 sk->dummy_th.dest=th->source;
1741                                 sk->copied_seq = sk->acked_seq;
1742                                 if(!sk->dead)
1743                                 {
1744                                         sk->state_change(sk);
1745                                         sock_wake_async(sk->socket, 0);
1746                                 }
1747                                 if(sk->max_window==0)
1748                                 {
1749                                         sk->max_window = 32;
1750                                         sk->mss = min(sk->max_window, sk->mtu);
1751                                 }
1752                         }
1753                         else
1754                         {
1755                                 /* See if SYNs cross. Drop if boring */
1756                                 if(th->syn && !th->rst)
1757                                 {
1758                                         /* Crossed SYNs are fine - but talking to
1759                                            yourself is right out... */
1760                                         if(sk->saddr==saddr && sk->daddr==daddr &&
1761                                                 sk->dummy_th.source==th->source &&
1762                                                 sk->dummy_th.dest==th->dest)
1763                                         {
1764                                                 tcp_statistics.TcpAttemptFails++;
1765                                                 return tcp_reset(sk,skb);
1766                                         }
1767                                         tcp_set_state(sk,TCP_SYN_RECV);
1768                                         
1769                                         /*
1770                                          *      FIXME:
1771                                          *      Must send SYN|ACK here
1772                                          */
1773                                 }               
1774                                 /* Discard junk segment */
1775                                 kfree_skb(skb, FREE_READ);
1776                                 return 0;
1777                         }
1778                         /*
1779                          *      SYN_RECV with data, maybe... drop through
1780                          */
1781                         goto rfc_step6;
1782                 }
1783 
1784         /*
1785          *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
1786          *      a more complex suggestion for fixing these reuse issues in RFC1644,
1787          *      but it is not yet ready for general use. Also see RFC1379.
1788          *
1789          *      Note the funny way we go back to the top of this function for
1790          *      this case ("goto try_next_socket").  That also takes care of
1791          *      checking "sk->users" for the new socket as well as doing all
1792          *      the normal tests on the packet.
1793          */
1794         
1795 #define BSD_TIME_WAIT
1796 #ifdef BSD_TIME_WAIT
1797                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
1798                         after(skb->seq, sk->acked_seq) && !th->rst)
1799                 {
1800                         u32 seq = sk->write_seq;
1801                         if(sk->debug)
1802                                 printk("Doing a BSD time wait\n");
1803                         tcp_statistics.TcpEstabResets++;           
1804                         atomic_sub(skb->truesize, &sk->rmem_alloc);
1805                         skb->sk = NULL;
1806                         sk->err=ECONNRESET;
1807                         tcp_set_state(sk, TCP_CLOSE);
1808                         sk->shutdown = SHUTDOWN_MASK;
1809                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
1810                         /* this is not really correct: we should check sk->users */
1811                         if (sk && sk->state==TCP_LISTEN)
1812                         {
1813                                 skb->sk = sk;
1814                                 atomic_add(skb->truesize, &sk->rmem_alloc);
1815                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
1816                                 return 0;
1817                         }
1818                         kfree_skb(skb, FREE_READ);
1819                         return 0;
1820                 }
1821 #endif  
1822         }
1823 
1824         /*
1825          *      We are now in normal data flow (see the step list in the RFC).
1826          *      Note most of these are inline now. I'll inline the lot when
1827          *      I have time to test it hard and look at what gcc outputs.
1828          */
1829         
1830         if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
1831         {
1832                 bad_tcp_sequence(sk, th, len, opt, saddr, dev);
1833                 kfree_skb(skb, FREE_READ);
1834                 return 0;
1835         }
1836 
1837         if(th->rst)
1838                 return tcp_reset(sk,skb);
1839         
1840         /*
1841          *      !syn_ok is effectively the state test in RFC793.
1842          */
1843          
1844         if(th->syn && !syn_ok)
1845         {
1846                 tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
1847                 return tcp_reset(sk,skb);       
1848         }
1849 
1850         tcp_delack_estimator(sk);
1851         
1852         /*
1853          *      Process the ACK
1854          */
1855          
1856 
1857         if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
1858         {
1859                 /*
1860                  *      Our three way handshake failed.
1861                  */
1862                  
1863                 if(sk->state==TCP_SYN_RECV)
1864                 {
1865                         tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
1866                 }
1867                 kfree_skb(skb, FREE_READ);
1868                 return 0;
1869         }
1870         
1871 rfc_step6:              /* I'll clean this up later */
1872 
1873         /*
1874          *      If the accepted buffer put us over our queue size we
1875          *      now drop it (we must process the ack first to avoid
1876          *      deadlock cases).
1877          */
1878 
1879         /*
1880          *      Process urgent data
1881          */
1882                 
1883         tcp_urg(sk, th, len);
1884         
1885         /*
1886          *      Process the encapsulated data
1887          */
1888         
1889         if(tcp_data(skb,sk, saddr, len))
1890                 kfree_skb(skb, FREE_READ);
1891 
1892         /*
1893          *      If our receive queue has grown past its limits,
1894          *      try to prune away duplicates etc..
1895          */
1896         if (sk->rmem_alloc > sk->rcvbuf)
1897                 prune_queue(&sk->receive_queue);
1898 
1899         /*
1900          *      And done
1901          */     
1902         
1903         return 0;
1904 
1905 no_tcp_socket:
1906         /*
1907          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
1908          */
1909         tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
1910 
1911 discard_it:
1912         /*
1913          *      Discard frame
1914          */
1915         skb->sk = NULL;
1916         kfree_skb(skb, FREE_READ);
1917         return 0;
1918 }
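
/*
 * Worked example (not in the source): the bookkeeping at line 1583
 * counts SYN and FIN as occupying one sequence number each, while the
 * header occupies none, so end_seq is the first sequence number NOT
 * covered by this segment.  Values are made up.
 */
#include <stdio.h>

typedef unsigned int u32;

int main(void)
{
        u32 seq  = 1000;   /* ntohl(th->seq)                         */
        u32 len  = 60;     /* TCP header + payload handed to tcp_rcv */
        u32 doff = 5;      /* 20-byte header                         */
        u32 syn  = 1, fin = 0;

        /* 1000 + 1 + 0 + 60 - 20 = 1041 */
        u32 end_seq = seq + syn + fin + len - doff*4;

        printf("segment covers [%u,%u)\n", seq, end_seq);
        return 0;
}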
