root/net/ipv4/tcp_input.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. tcp_delack_estimator
  2. tcp_rtt_estimator
  3. tcp_cache_zap
  4. get_tcp_sock
  5. bad_tcp_sequence
  6. tcp_sequence
  7. tcp_reset
  8. tcp_options
  9. tcp_conn_request
  10. tcp_window_shrunk
  11. tcp_ack
  12. tcp_fin
  13. tcp_data
  14. tcp_check_urg
  15. tcp_urg
  16. tcp_rcv

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp_input.c 1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  */
  22 
  23 #include <linux/config.h>
  24 #include <net/tcp.h>
  25 
  26 /*
   27  *      Policy code extracted so it's now separate
  28  */
  29 
  30 /*
  31  *      Called each time to estimate the delayed ack timeout. This is
   32  *      how it should be done so a fast link isn't impacted by ack delay.
  33  */
  34  
  35 extern __inline__ void tcp_delack_estimator(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
  36 {
  37         /*
  38          *      Delayed ACK time estimator.
  39          */
  40         
  41         if (sk->lrcvtime == 0) 
  42         {
  43                 sk->lrcvtime = jiffies;
  44                 sk->ato = HZ/3;
  45         }
  46         else 
  47         {
  48                 int m;
  49                 
  50                 m = jiffies - sk->lrcvtime;
  51 
  52                 sk->lrcvtime = jiffies;
  53 
  54                 if (m <= 0)
  55                         m = 1;
  56 
  57                 if (m > (sk->rtt >> 3)) 
  58                 {
  59                         sk->ato = sk->rtt >> 3;
  60                         /*
  61                          * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
  62                          */
  63                 }
  64                 else 
  65                 {
  66                         sk->ato = (sk->ato >> 1) + m;
  67                         /*
  68                          * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
  69                          */
  70                 }
  71         }
  72 }
  73 
  74 /*
  75  *      Called on frames that were known _not_ to have been
  76  *      retransmitted [see Karn/Partridge Proceedings SIGCOMM 87]. 
  77  *      The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
  78  */
  79  
  80 extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
     /* [previous][next][first][last][top][bottom][index][help] */
  81 {
  82         long m;
  83         /*
  84          *      The following amusing code comes from Jacobson's
  85          *      article in SIGCOMM '88.  Note that rtt and mdev
  86          *      are scaled versions of rtt and mean deviation.
  87          *      This is designed to be as fast as possible 
  88          *      m stands for "measurement".
  89          */
  90         
  91         m = jiffies - oskb->when;  /* RTT */
  92         if(m<=0)
  93                 m=1;            /* IS THIS RIGHT FOR <0 ??? */
  94         m -= (sk->rtt >> 3);    /* m is now error in rtt est */
  95         sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
  96         if (m < 0)
  97                 m = -m;         /* m is now abs(error) */
  98         m -= (sk->mdev >> 2);   /* similar update on mdev */
  99         sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
 100 
 101         /*
 102          *      Now update timeout.  Note that this removes any backoff.
 103          */
 104                          
 105         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
 106         if (sk->rto > 120*HZ)
 107                 sk->rto = 120*HZ;
 108         if (sk->rto < HZ/5)     /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
 109                 sk->rto = HZ/5;
 110         sk->backoff = 0;
 111 }
 112 
 113 /*
 114  *      Cached last hit socket
 115  */
 116  
 117 static volatile unsigned long   th_cache_saddr, th_cache_daddr;
 118 static volatile unsigned short  th_cache_dport, th_cache_sport;
 119 static volatile struct sock *th_cache_sk;
 120 
 121 void tcp_cache_zap(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 122 {
 123         th_cache_sk=NULL;
 124 }
 125 
 126 /*
 127  *      Find the socket, using the last hit cache if applicable. The cache is not quite
 128  *      right...
 129  */
 130 
 131 static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
     /* [previous][next][first][last][top][bottom][index][help] */
 132 {
 133         struct sock * sk;
 134 
 135         sk = (struct sock *) th_cache_sk;
 136         if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
 137             sport != th_cache_sport || dport != th_cache_dport) {
 138                 sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
 139                 if (sk) {
 140                         th_cache_saddr=saddr;
 141                         th_cache_daddr=daddr;
 142                         th_cache_dport=dport;
 143                         th_cache_sport=sport;
 144                         th_cache_sk=sk;
 145                 }
 146         }
 147         return sk;
 148 }
 149 
 150 /*
 151  * React to a out-of-window TCP sequence number in an incoming packet
 152  */
 153  
 154 static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
     /* [previous][next][first][last][top][bottom][index][help] */
 155              struct options *opt, unsigned long saddr, struct device *dev)
 156 {
 157         if (th->rst)
 158                 return;
 159 
 160         /*
 161          *      Send a reset if we get something not ours and we are
 162          *      unsynchronized. Note: We don't do anything to our end. We
 163          *      are just killing the bogus remote connection then we will
 164          *      connect again and it will work (with luck).
 165          */
 166          
 167         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
 168         {
 169                 tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
 170                 return;
 171         }
 172         
 173         /*
 174          *      4.3reno machines look for these kind of acks so they can do fast
 175          *      recovery. Three identical 'old' acks lets it know that one frame has
 176          *      been lost and should be resent. Because this is before the whole window
 177          *      of data has timed out it can take one lost frame per window without
 178          *      stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
 179          *
 180          *      We also should be spotting triple bad sequences.
 181          */
 182         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
 183         return;
 184 }
 185 
 186 /*
 187  *      This functions checks to see if the tcp header is actually acceptable. 
 188  */
 189  
 190 extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
     /* [previous][next][first][last][top][bottom][index][help] */
 191 {
 192         u32 end_window = sk->acked_seq + sk->window;
 193         return  /* if start is at end of window, end must be too (zero window) */
 194                 (seq == end_window && seq == end_seq) ||
 195                 /* if start is before end of window, check for interest */
 196                 (before(seq, end_window) && !before(end_seq, sk->acked_seq));
 197 }
 198 
 199 /*
 200  *      When we get a reset we do this. This probably is a tcp_output routine
 201  *      really.
 202  */
 203 
static int tcp_reset(struct sock *sk, struct sk_buff *skb)
{
	/* Mark the socket zapped: TCP will send nothing further on it. */
	sk->zapped = 1;
	/*
	 *	We want the right error as BSD sees it (and indeed as we do).
	 *	ECONNRESET normally; ECONNREFUSED when the RST answers our
	 *	SYN; EPIPE when the peer resets a half-closed connection.
	 */
	sk->err = ECONNRESET;
	if (sk->state == TCP_SYN_SENT)
		sk->err = ECONNREFUSED;
	if (sk->state == TCP_CLOSE_WAIT)
		sk->err = EPIPE;
#ifdef CONFIG_TCP_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 *
	 *	A RST must not tear down a TIME_WAIT socket, so only move to
	 *	CLOSE from other states when this option is configured.
	 *
	 *	This is a good idea, but causes more sockets to take time to close.
	 *
	 *	Ian Heavens has since shown this is an inadequate fix for the protocol
	 *	bug in question.
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{	
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else	
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif	
	/* Wake any sleeper so it sees sk->err, then drop the RST segment. */
	if (!sk->dead) 
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	return(0);
}
 238 
 239 
 240 /*
 241  *      Look for tcp options. Parses everything but only knows about MSS.
 242  *      This routine is always called with the packet containing the SYN.
 243  *      However it may also be called with the ack to the SYN.  So you
 244  *      can't assume this is always the SYN.  It's always called after
 245  *      we have set up sk->mtu to our own MTU.
 246  *
 247  *      We need at minimum to add PAWS support here. Possibly large windows
 248  *      as Linux gets deployed on 100Mb/sec networks.
 249  */
 250  
 251 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
 252 {
 253         unsigned char *ptr;
 254         int length=(th->doff*4)-sizeof(struct tcphdr);
 255         int mss_seen = 0;
 256     
 257         ptr = (unsigned char *)(th + 1);
 258   
 259         while(length>0)
 260         {
 261                 int opcode=*ptr++;
 262                 int opsize=*ptr++;
 263                 switch(opcode)
 264                 {
 265                         case TCPOPT_EOL:
 266                                 return;
 267                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
 268                                 length--;
 269                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
 270                                 continue;
 271                         
 272                         default:
 273                                 if(opsize<=2)   /* Avoid silly options looping forever */
 274                                         return;
 275                                 switch(opcode)
 276                                 {
 277                                         case TCPOPT_MSS:
 278                                                 if(opsize==4 && th->syn)
 279                                                 {
 280                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
 281                                                         mss_seen = 1;
 282                                                 }
 283                                                 break;
 284                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
 285                                 }
 286                                 ptr+=opsize-2;
 287                                 length-=opsize;
 288                 }
 289         }
 290         if (th->syn) 
 291         {
 292                 if (! mss_seen)
 293                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
 294         }
 295 #ifdef CONFIG_INET_PCTCP
 296         sk->mss = min(sk->max_window >> 1, sk->mtu);
 297 #else    
 298         sk->mss = min(sk->max_window, sk->mtu);
 299         sk->max_unacked = 2 * sk->mss;
 300 #endif  
 301 }
 302 
 303 
 304 /*
 305  *      This routine handles a connection request.
 306  *      It should make sure we haven't already responded.
 307  *      Because of the way BSD works, we have to send a syn/ack now.
 308  *      This also means it will be harder to close a socket which is
 309  *      listening.
 310  */
 311  
 312 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
     /* [previous][next][first][last][top][bottom][index][help] */
 313                  u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
 314 {
 315         struct sock *newsk;
 316         struct tcphdr *th;
 317         struct rtable *rt;
 318   
 319         th = skb->h.th;
 320 
 321         /* If the socket is dead, don't accept the connection. */
 322         if (!sk->dead) 
 323         {
 324                 sk->data_ready(sk,0);
 325         }
 326         else 
 327         {
 328                 if(sk->debug)
 329                         printk("Reset on %p: Connect on dead socket.\n",sk);
 330                 tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
 331                 tcp_statistics.TcpAttemptFails++;
 332                 kfree_skb(skb, FREE_READ);
 333                 return;
 334         }
 335 
 336         /*
 337          *      Make sure we can accept more.  This will prevent a
 338          *      flurry of syns from eating up all our memory.
 339          *
 340          *      BSD does some funnies here and allows 3/2 times the
 341          *      set backlog as a fudge factor. Thats just too gross.
 342          */
 343 
 344         if (sk->ack_backlog >= sk->max_ack_backlog) 
 345         {
 346                 tcp_statistics.TcpAttemptFails++;
 347                 kfree_skb(skb, FREE_READ);
 348                 return;
 349         }
 350 
 351         /*
 352          * We need to build a new sock struct.
 353          * It is sort of bad to have a socket without an inode attached
 354          * to it, but the wake_up's will just wake up the listening socket,
 355          * and if the listening socket is destroyed before this is taken
 356          * off of the queue, this will take care of it.
 357          */
 358 
 359         newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
 360         if (newsk == NULL) 
 361         {
 362                 /* just ignore the syn.  It will get retransmitted. */
 363                 tcp_statistics.TcpAttemptFails++;
 364                 kfree_skb(skb, FREE_READ);
 365                 return;
 366         }
 367 
 368         memcpy(newsk, sk, sizeof(*newsk));
 369         newsk->opt = NULL;
 370         newsk->ip_route_cache  = NULL;
 371         if (opt && opt->optlen) 
 372         {
 373                 sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
 374                 if (!sk->opt) 
 375                 {
 376                         kfree_s(newsk, sizeof(struct sock));
 377                         tcp_statistics.TcpAttemptFails++;
 378                         kfree_skb(skb, FREE_READ);
 379                         return;
 380                 }
 381                 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) 
 382                 {
 383                         kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
 384                         kfree_s(newsk, sizeof(struct sock));
 385                         tcp_statistics.TcpAttemptFails++;
 386                         kfree_skb(skb, FREE_READ);
 387                         return;
 388                 }
 389         }
 390         skb_queue_head_init(&newsk->write_queue);
 391         skb_queue_head_init(&newsk->receive_queue);
 392         newsk->send_head = NULL;
 393         newsk->send_tail = NULL;
 394         skb_queue_head_init(&newsk->back_log);
 395         newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
 396         newsk->rto = TCP_TIMEOUT_INIT;
 397         newsk->mdev = 0;
 398         newsk->max_window = 0;
 399         newsk->cong_window = 1;
 400         newsk->cong_count = 0;
 401         newsk->ssthresh = 0;
 402         newsk->backoff = 0;
 403         newsk->blog = 0;
 404         newsk->intr = 0;
 405         newsk->proc = 0;
 406         newsk->done = 0;
 407         newsk->partial = NULL;
 408         newsk->pair = NULL;
 409         newsk->wmem_alloc = 0;
 410         newsk->rmem_alloc = 0;
 411         newsk->localroute = sk->localroute;
 412 
 413         newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
 414 
 415         newsk->err = 0;
 416         newsk->shutdown = 0;
 417         newsk->ack_backlog = 0;
 418         newsk->acked_seq = skb->seq+1;
 419         newsk->lastwin_seq = skb->seq+1;
 420         newsk->delay_acks = 1;
 421         newsk->copied_seq = skb->seq+1;
 422         newsk->fin_seq = skb->seq;
 423         newsk->state = TCP_SYN_RECV;
 424         newsk->timeout = 0;
 425         newsk->ip_xmit_timeout = 0;
 426         newsk->write_seq = seq; 
 427         newsk->window_seq = newsk->write_seq;
 428         newsk->rcv_ack_seq = newsk->write_seq;
 429         newsk->urg_data = 0;
 430         newsk->retransmits = 0;
 431         newsk->linger=0;
 432         newsk->destroy = 0;
 433         init_timer(&newsk->timer);
 434         newsk->timer.data = (unsigned long)newsk;
 435         newsk->timer.function = &net_timer;
 436         init_timer(&newsk->retransmit_timer);
 437         newsk->retransmit_timer.data = (unsigned long)newsk;
 438         newsk->retransmit_timer.function=&tcp_retransmit_timer;
 439         newsk->dummy_th.source = skb->h.th->dest;
 440         newsk->dummy_th.dest = skb->h.th->source;
 441         
 442         /*
 443          *      Swap these two, they are from our point of view. 
 444          */
 445          
 446         newsk->daddr = saddr;
 447         newsk->saddr = daddr;
 448         newsk->rcv_saddr = daddr;
 449 
 450         put_sock(newsk->num,newsk);
 451         newsk->acked_seq = skb->seq + 1;
 452         newsk->copied_seq = skb->seq + 1;
 453         newsk->socket = NULL;
 454 
 455         /*
 456          *      Grab the ttl and tos values and use them 
 457          */
 458 
 459         newsk->ip_ttl=sk->ip_ttl;
 460         newsk->ip_tos=skb->ip_hdr->tos;
 461 
 462         /*
 463          *      Use 512 or whatever user asked for 
 464          */
 465 
 466         /*
 467          *      Note use of sk->user_mss, since user has no direct access to newsk 
 468          */
 469 
 470         rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
 471         newsk->ip_route_cache = rt;
 472         
 473         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
 474                 newsk->window_clamp = rt->rt_window;
 475         else
 476                 newsk->window_clamp = 0;
 477                 
 478         if (sk->user_mss)
 479                 newsk->mtu = sk->user_mss;
 480         else if (rt)
 481                 newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
 482         else 
 483                 newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
 484 
 485         /*
 486          *      But not bigger than device MTU 
 487          */
 488 
 489         newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
 490 
 491 #ifdef CONFIG_SKIP
 492         
 493         /*
 494          *      SKIP devices set their MTU to 65535. This is so they can take packets
 495          *      unfragmented to security process then fragment. They could lie to the
 496          *      TCP layer about a suitable MTU, but its easier to let skip sort it out
 497          *      simply because the final package we want unfragmented is going to be
 498          *
 499          *      [IPHDR][IPSP][Security data][Modified TCP data][Security data]
 500          */
 501          
 502         if(skip_pick_mtu!=NULL)         /* If SKIP is loaded.. */
 503                 sk->mtu=skip_pick_mtu(sk->mtu,dev);
 504 #endif
 505         /*
 506          *      This will min with what arrived in the packet 
 507          */
 508 
 509         tcp_options(newsk,skb->h.th);
 510         
 511         tcp_cache_zap();
 512         tcp_send_synack(newsk, sk, skb);
 513 }
 514 
 515 
 516 /*
 517  * Handle a TCP window that shrunk on us. It shouldn't happen,
 518  * but..
 519  *
 520  * We may need to move packets from the send queue
 521  * to the write queue, if the window has been shrunk on us.
 522  * The RFC says you are not allowed to shrink your window
 523  * like this, but if the other end does, you must be able
 524  * to deal with it.
 525  */
void tcp_window_shrunk(struct sock * sk, u32 window_seq)
{
	struct sk_buff *skb;
	struct sk_buff *skb2;	/* walker over the old retransmit list */
	struct sk_buff *wskb = NULL;	/* last skb re-queued on write_queue,
					   so order is preserved on append */
	
	/* Detach the whole retransmit list; it is rebuilt below with only
	   the segments that still fit inside the shrunken window. */
	skb2 = sk->send_head;
	sk->send_head = NULL;
	sk->send_tail = NULL;

	/*
	 *	This is an artifact of a flawed concept. We want one
	 *	queue and a smarter send routine when we send all.
	 */
	cli();		/* protect the list surgery from interrupts */
	while (skb2 != NULL) 
	{
		skb = skb2;
		skb2 = skb->link3;
		skb->link3 = NULL;
		if (after(skb->end_seq, window_seq)) 
		{
			/* Segment now ends beyond the window: pull it back
			   onto the write queue to be sent again later. */
			if (sk->packets_out > 0) 
				sk->packets_out--;
			/* We may need to remove this from the dev send list. */
			if (skb->next != NULL) 
			{
				skb_unlink(skb);				
			}
			/* Now add it to the write_queue. */
			if (wskb == NULL)
				skb_queue_head(&sk->write_queue,skb);
			else
				skb_append(wskb,skb);
			wskb = skb;
		} 
		else 
		{
			/* Still inside the window: keep it on the rebuilt
			   retransmit list, preserving original order. */
			if (sk->send_head == NULL) 
			{
				sk->send_head = skb;
				sk->send_tail = skb;
			}
			else
			{
				sk->send_tail->link3 = skb;
				sk->send_tail = skb;
			}
			skb->link3 = NULL;
		}
	}
	sti();
}
 579 
 580 
 581 /*
 582  *      This routine deals with incoming acks, but not outgoing ones.
 583  *
 584  *      This routine is totally _WRONG_. The list structuring is wrong,
 585  *      the algorithm is wrong, the code is wrong.
 586  */
 587 
 588 static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
     /* [previous][next][first][last][top][bottom][index][help] */
 589 {
 590         int flag = 0;
 591         u32 window_seq;
 592 
 593         /* 
 594          * 1 - there was data in packet as well as ack or new data is sent or 
 595          *     in shutdown state
 596          * 2 - data from retransmit queue was acked and removed
 597          * 4 - window shrunk or data from retransmit queue was acked and removed
 598          */
 599 
 600         if(sk->zapped)
 601                 return(1);      /* Dead, cant ack any more so why bother */
 602 
 603         /*
 604          *      We have dropped back to keepalive timeouts. Thus we have
 605          *      no retransmits pending.
 606          */
 607          
 608         if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
 609                 sk->retransmits = 0;
 610 
 611         /*
 612          *      If the ack is newer than sent or older than previous acks
 613          *      then we can probably ignore it.
 614          */
 615          
 616         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
 617                 goto uninteresting_ack;
 618 
 619         /*
 620          *      If there is data set flag 1
 621          */
 622          
 623         if (len != th->doff*4) 
 624                 flag |= 1;
 625 
 626         /*
 627          *      Have we discovered a larger window
 628          */
 629         window_seq = ntohs(th->window);
 630         if (window_seq > sk->max_window) 
 631         {
 632                 sk->max_window = window_seq;
 633 #ifdef CONFIG_INET_PCTCP
 634                 /* Hack because we don't send partial packets to non SWS
 635                    handling hosts */
 636                 sk->mss = min(window_seq>>1, sk->mtu);
 637 #else
 638                 sk->mss = min(window_seq, sk->mtu);
 639 #endif  
 640         }
 641         window_seq += ack;
 642 
 643         /*
 644          *      See if our window has been shrunk. 
 645          */
 646         if (after(sk->window_seq, window_seq)) {
 647                 flag |= 4;
 648                 tcp_window_shrunk(sk, window_seq);
 649         }
 650 
 651         /*
 652          *      Update the right hand window edge of the host
 653          */
 654         sk->window_seq = window_seq;
 655 
 656         /*
 657          *      Pipe has emptied
 658          */      
 659         if (sk->send_tail == NULL || sk->send_head == NULL) 
 660         {
 661                 sk->send_head = NULL;
 662                 sk->send_tail = NULL;
 663                 sk->packets_out= 0;
 664         }
 665 
 666         /*
 667          *      We don't want too many packets out there. 
 668          */
 669          
 670         if (sk->ip_xmit_timeout == TIME_WRITE && 
 671                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
 672         {
 673                 
 674                 /* 
 675                  * This is Jacobson's slow start and congestion avoidance. 
 676                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
 677                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
 678                  * counter and increment it once every cwnd times.  It's possible
 679                  * that this should be done only if sk->retransmits == 0.  I'm
 680                  * interpreting "new data is acked" as including data that has
 681                  * been retransmitted but is just now being acked.
 682                  */
 683                 if (sk->cong_window < sk->ssthresh)  
 684                         /* 
 685                          *      In "safe" area, increase
 686                          */
 687                         sk->cong_window++;
 688                 else 
 689                 {
 690                         /*
 691                          *      In dangerous area, increase slowly.  In theory this is
 692                          *      sk->cong_window += 1 / sk->cong_window
 693                          */
 694                         if (sk->cong_count >= sk->cong_window) 
 695                         {
 696                                 sk->cong_window++;
 697                                 sk->cong_count = 0;
 698                         }
 699                         else 
 700                                 sk->cong_count++;
 701                 }
 702         }
 703 
 704         /*
 705          *      Remember the highest ack received.
 706          */
 707          
 708         sk->rcv_ack_seq = ack;
 709         
 710         /*
 711          *      We passed data and got it acked, remove any soft error
 712          *      log. Something worked...
 713          */
 714          
 715         sk->err_soft = 0;
 716 
 717         /*
 718          *      If this ack opens up a zero window, clear backoff.  It was
 719          *      being used to time the probes, and is probably far higher than
 720          *      it needs to be for normal retransmission.
 721          */
 722 
 723         if (sk->ip_xmit_timeout == TIME_PROBE0) 
 724         {
 725                 sk->retransmits = 0;    /* Our probe was answered */
 726                 
 727                 /*
 728                  *      Was it a usable window open ?
 729                  */
 730                  
 731                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
 732                     ! before (sk->window_seq, sk->write_queue.next->end_seq)) 
 733                 {
 734                         sk->backoff = 0;
 735                         
 736                         /*
 737                          *      Recompute rto from rtt.  this eliminates any backoff.
 738                          */
 739 
 740                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
 741                         if (sk->rto > 120*HZ)
 742                                 sk->rto = 120*HZ;
 743                         if (sk->rto < HZ/5)     /* Was 1*HZ, then 1 - turns out we must allow about
 744                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
 745                                                    .2 of a second is going to need huge windows (SIGH) */
 746                         sk->rto = HZ/5;
 747                 }
 748         }
 749 
 750         /* 
 751          *      See if we can take anything off of the retransmit queue.
 752          */
 753    
 754         while(sk->send_head != NULL) 
 755         {
 756                 /* Check for a bug. */
 757                 if (sk->send_head->link3 &&
 758                     after(sk->send_head->end_seq, sk->send_head->link3->end_seq)) 
 759                         printk("INET: tcp.c: *** bug send_list out of order.\n");
 760                         
 761                 /*
 762                  *      If our packet is before the ack sequence we can
 763                  *      discard it as it's confirmed to have arrived the other end.
 764                  */
 765                  
 766                 if (before(sk->send_head->end_seq, ack+1)) 
 767                 {
 768                         struct sk_buff *oskb;   
 769                         if (sk->retransmits) 
 770                         {       
 771                                 /*
 772                                  *      We were retransmitting.  don't count this in RTT est 
 773                                  */
 774                                 flag |= 2;
 775 
 776                                 /*
 777                                  * even though we've gotten an ack, we're still
 778                                  * retransmitting as long as we're sending from
 779                                  * the retransmit queue.  Keeping retransmits non-zero
 780                                  * prevents us from getting new data interspersed with
 781                                  * retransmissions.
 782                                  */
 783 
 784                                 if (sk->send_head->link3)       /* Any more queued retransmits? */
 785                                         sk->retransmits = 1;
 786                                 else
 787                                         sk->retransmits = 0;
 788                         }
 789                         /*
 790                          * Note that we only reset backoff and rto in the
 791                          * rtt recomputation code.  And that doesn't happen
 792                          * if there were retransmissions in effect.  So the
 793                          * first new packet after the retransmissions is
 794                          * sent with the backoff still in effect.  Not until
 795                          * we get an ack from a non-retransmitted packet do
 796                          * we reset the backoff and rto.  This allows us to deal
 797                          * with a situation where the network delay has increased
 798                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
 799                          */
 800 
 801                         /*
 802                          *      We have one less packet out there. 
 803                          */
 804                          
 805                         if (sk->packets_out > 0) 
 806                                 sk->packets_out --;
 807 
 808                         oskb = sk->send_head;
 809 
 810                         if (!(flag&2))  /* Not retransmitting */
 811                                 tcp_rtt_estimator(sk,oskb);
 812                         flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
 813                                            In this case as we just set it up */
 814                         cli();
 815                         oskb = sk->send_head;
 816                         IS_SKB(oskb);
 817                         sk->send_head = oskb->link3;
 818                         if (sk->send_head == NULL) 
 819                         {
 820                                 sk->send_tail = NULL;
 821                         }
 822 
 823                 /*
 824                  *      We may need to remove this from the dev send list. 
 825                  */
 826 
 827                         if (oskb->next)
 828                                 skb_unlink(oskb);
 829                         sti();
 830                         kfree_skb(oskb, FREE_WRITE); /* write. */
 831                         if (!sk->dead)
 832                                 sk->write_space(sk);
 833                 }
 834                 else
 835                 {
 836                         break;
 837                 }
 838         }
 839 
 840         /*
 841          * XXX someone ought to look at this too.. at the moment, if skb_peek()
 842          * returns non-NULL, we completely ignore the timer stuff in the else
 843          * clause.  We ought to organize the code so that else clause can
 844          * (should) be executed regardless, possibly moving the PROBE timer
 845          * reset over.  The skb_peek() thing should only move stuff to the
 846          * write queue, NOT also manage the timer functions.
 847          */
 848 
 849         /*
 850          * Maybe we can take some stuff off of the write queue,
 851          * and put it onto the xmit queue.
 852          */
 853         if (skb_peek(&sk->write_queue) != NULL) 
 854         {
 855                 if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
 856                         (sk->retransmits == 0 || 
 857                          sk->ip_xmit_timeout != TIME_WRITE ||
 858                          before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
 859                         && sk->packets_out < sk->cong_window) 
 860                 {
 861                         /*
 862                          *      Add more data to the send queue.
 863                          */
 864                         flag |= 1;
 865                         tcp_write_xmit(sk);
 866                 }
 867                 else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
 868                         sk->send_head == NULL &&
 869                         sk->ack_backlog == 0 &&
 870                         sk->state != TCP_TIME_WAIT) 
 871                 {
 872                         /*
 873                          *      Data to queue but no room.
 874                          */
 875                         tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
 876                 }               
 877         }
 878         else
 879         {
 880                 /*
 881                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
 882                  * from TCP_CLOSE we don't do anything
 883                  *
 884                  * from anything else, if there is write data (or fin) pending,
 885                  * we use a TIME_WRITE timeout, else if keepalive we reset to
 886                  * a KEEPALIVE timeout, else we delete the timer.
 887                  *
 888                  * We do not set flag for nominal write data, otherwise we may
 889                  * force a state where we start to write itsy bitsy tidbits
 890                  * of data.
 891                  */
 892 
 893                 switch(sk->state) {
 894                 case TCP_TIME_WAIT:
 895                         /*
 896                          * keep us in TIME_WAIT until we stop getting packets,
 897                          * reset the timeout.
 898                          */
 899                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 900                         break;
 901                 case TCP_CLOSE:
 902                         /*
 903                          * don't touch the timer.
 904                          */
 905                         break;
 906                 default:
 907                         /*
 908                          *      Must check send_head, write_queue, and ack_backlog
 909                          *      to determine which timeout to use.
 910                          */
 911                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
 912                                 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
 913                         } else if (sk->keepopen) {
 914                                 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
 915                         } else {
 916                                 del_timer(&sk->retransmit_timer);
 917                                 sk->ip_xmit_timeout = 0;
 918                         }
 919                         break;
 920                 }
 921         }
 922 
 923         /*
 924          *      We have nothing queued but space to send. Send any partial
 925          *      packets immediately (end of Nagle rule application).
 926          */
 927          
 928         if (sk->packets_out == 0 && sk->partial != NULL &&
 929                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
 930         {
 931                 flag |= 1;
 932                 tcp_send_partial(sk);
 933         }
 934 
 935         /*
 936          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
 937          * we are now waiting for an acknowledge to our FIN.  The other end is
 938          * already in TIME_WAIT.
 939          *
 940          * Move to TCP_CLOSE on success.
 941          */
 942 
 943         if (sk->state == TCP_LAST_ACK) 
 944         {
 945                 if (!sk->dead)
 946                         sk->state_change(sk);
 947                 if(sk->debug)
 948                         printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
 949                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
 950                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
 951                 {
 952                         flag |= 1;
 953                         sk->shutdown = SHUTDOWN_MASK;
 954                         tcp_set_state(sk,TCP_CLOSE);
 955                         return 1;
 956                 }
 957         }
 958 
 959         /*
 960          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
 961          *
 962          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
 963          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
 964          */
 965 
 966         if (sk->state == TCP_FIN_WAIT1) 
 967         {
 968 
 969                 if (!sk->dead) 
 970                         sk->state_change(sk);
 971                 if (sk->rcv_ack_seq == sk->write_seq) 
 972                 {
 973                         flag |= 1;
 974                         sk->shutdown |= SEND_SHUTDOWN;
 975                         tcp_set_state(sk, TCP_FIN_WAIT2);
 976                 }
 977         }
 978 
 979         /*
 980          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
 981          *
 982          *      Move to TIME_WAIT
 983          */
 984 
 985         if (sk->state == TCP_CLOSING) 
 986         {
 987 
 988                 if (!sk->dead) 
 989                         sk->state_change(sk);
 990                 if (sk->rcv_ack_seq == sk->write_seq) 
 991                 {
 992                         flag |= 1;
 993                         tcp_time_wait(sk);
 994                 }
 995         }
 996         
 997         /*
 998          *      Final ack of a three way shake 
 999          */
1000          
1001         if(sk->state==TCP_SYN_RECV)
1002         {
1003                 tcp_set_state(sk, TCP_ESTABLISHED);
1004                 tcp_options(sk,th);
1005                 sk->dummy_th.dest=th->source;
1006                 sk->copied_seq = sk->acked_seq;
1007                 if(!sk->dead)
1008                         sk->state_change(sk);
1009                 if(sk->max_window==0)
1010                 {
1011                         sk->max_window=32;      /* Sanity check */
1012                         sk->mss=min(sk->max_window,sk->mtu);
1013                 }
1014         }
1015         
1016         /*
1017          * I make no guarantees about the first clause in the following
1018          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
1019          * what conditions "!flag" would be true.  However I think the rest
1020          * of the conditions would prevent that from causing any
1021          * unnecessary retransmission. 
1022          *   Clearly if the first packet has expired it should be 
1023          * retransmitted.  The other alternative, "flag&2 && retransmits", is
1024          * harder to explain:  You have to look carefully at how and when the
1025          * timer is set and with what timeout.  The most recent transmission always
1026          * sets the timer.  So in general if the most recent thing has timed
1027          * out, everything before it has as well.  So we want to go ahead and
1028          * retransmit some more.  If we didn't explicitly test for this
1029          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
1030          * would not be true.  If you look at the pattern of timing, you can
1031          * show that rto is increased fast enough that the next packet would
1032          * almost never be retransmitted immediately.  Then you'd end up
1033          * waiting for a timeout to send each packet on the retransmission
1034          * queue.  With my implementation of the Karn sampling algorithm,
1035          * the timeout would double each time.  The net result is that it would
1036          * take a hideous amount of time to recover from a single dropped packet.
1037          * It's possible that there should also be a test for TIME_WRITE, but
1038          * I think as long as "send_head != NULL" and "retransmit" is on, we've
1039          * got to be in real retransmission mode.
1040          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
1041          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
1042          * As long as no further losses occur, this seems reasonable.
1043          */
1044         
1045         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
1046                (((flag&2) && sk->retransmits) ||
1047                (sk->send_head->when + sk->rto < jiffies))) 
1048         {
1049                 if(sk->send_head->when + sk->rto < jiffies)
1050                         tcp_retransmit(sk,0);   
1051                 else
1052                 {
1053                         tcp_do_retransmit(sk, 1);
1054                         tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1055                 }
1056         }
1057 
1058         return 1;
1059 
1060 uninteresting_ack:
1061         if(sk->debug)
1062                 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
1063                         
1064         /*
1065          *      Keepalive processing.
1066          */
1067                  
1068         if (after(ack, sk->sent_seq)) 
1069         {
1070                 return 0;
1071         }
1072                 
1073         /*
1074          *      Restart the keepalive timer.
1075          */
1076                  
1077         if (sk->keepopen) 
1078         {
1079                 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
1080                         tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1081         }
1082         return 1;
1083 }
1084 
1085 
1086 /*
1087  *      Process the FIN bit. This now behaves as it is supposed to work
1088  *      and the FIN takes effect when it is validly part of sequence
1089  *      space. Not before when we get holes.
1090  *
1091  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1092  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
1093  *      TIME-WAIT)
1094  *
1095  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
1096  *      close and we go into CLOSING (and later onto TIME-WAIT)
1097  *
1098  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1099  *
1100  */
1101  
static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
        /*
         *      Record the sequence position of the received FIN. Per the
         *      header comment above, the FIN only takes effect when it is
         *      validly part of sequence space (tcp_data() handles that).
         */
        sk->fin_seq = skb->end_seq;

        /*
         *      Wake up anyone sleeping on / polling this socket - its
         *      state is about to change.
         */
        if (!sk->dead) 
        {
                sk->state_change(sk);
                sock_wake_async(sk->socket, 1);
        }

        switch(sk->state) 
        {
                case TCP_SYN_RECV:
                case TCP_SYN_SENT:
                case TCP_ESTABLISHED:
                        /*
                         * move to CLOSE_WAIT, tcp_data() already handled
                         * sending the ack.
                         */
                        tcp_set_state(sk,TCP_CLOSE_WAIT);
                        /* A FIN combined with RST shuts down both directions
                           immediately. */
                        if (th->rst)
                                sk->shutdown = SHUTDOWN_MASK;
                        break;

                case TCP_CLOSE_WAIT:
                case TCP_CLOSING:
                        /*
                         * received a retransmission of the FIN, do
                         * nothing.
                         */
                        break;
                case TCP_TIME_WAIT:
                        /*
                         * received a retransmission of the FIN,
                         * restart the TIME_WAIT timer.
                         */
                        tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                        return(0);
                case TCP_FIN_WAIT1:
                        /*
                         * This case occurs when a simultaneous close
                         * happens, we must ack the received FIN and
                         * enter the CLOSING state.
                         *
                         * This causes a WRITE timeout, which will either
                         * move on to TIME_WAIT when we timeout, or resend
                         * the FIN properly (maybe we get rid of that annoying
                         * FIN lost hang). The TIME_WRITE code is already correct
                         * for handling this timeout.
                         */

                        if(sk->ip_xmit_timeout != TIME_WRITE)
                                tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
                        tcp_set_state(sk,TCP_CLOSING);
                        break;
                case TCP_FIN_WAIT2:
                        /*
                         * received a FIN -- send ACK and enter TIME_WAIT
                         */
                        tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                        sk->shutdown|=SHUTDOWN_MASK;
                        tcp_set_state(sk,TCP_TIME_WAIT);
                        break;
                case TCP_CLOSE:
                        /*
                         * already in CLOSE
                         */
                        break;
                default:
                        /*
                         * Any remaining state: go to LAST_ACK and wait for
                         * our FIN to be acknowledged (bounded by the timer).
                         */
                        tcp_set_state(sk,TCP_LAST_ACK);
        
                        /* Start the timers. */
                        tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                        return(0);
        }

        return(0);
}
1180 
1181 
1182 
1183 /*
1184  *      This routine handles the data.  If there is room in the buffer,
1185  *      it will be have already been moved into it.  If there is no
1186  *      room, then we will just have to discard the packet.
1187  */
1188 
1189 static int tcp_data(struct sk_buff *skb, struct sock *sk, 
     /* [previous][next][first][last][top][bottom][index][help] */
1190          unsigned long saddr, unsigned short len)
1191 {
1192         struct sk_buff *skb1, *skb2;
1193         struct tcphdr *th;
1194         int dup_dumped=0;
1195         u32 new_seq, shut_seq;
1196 
1197         th = skb->h.th;
1198         skb_pull(skb,th->doff*4);
1199         skb_trim(skb,len-(th->doff*4));
1200 
1201         /*
1202          *      The bytes in the receive read/assembly queue has increased. Needed for the
1203          *      low memory discard algorithm 
1204          */
1205            
1206         sk->bytes_rcv += skb->len;
1207         
1208         if (skb->len == 0 && !th->fin) 
1209         {
1210                 /* 
1211                  *      Don't want to keep passing ack's back and forth. 
1212                  *      (someone sent us dataless, boring frame)
1213                  */
1214                 if (!th->ack)
1215                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
1216                 kfree_skb(skb, FREE_READ);
1217                 return(0);
1218         }
1219         
1220         /*
1221          *      We no longer have anyone receiving data on this connection.
1222          */
1223 
1224 #ifndef TCP_DONT_RST_SHUTDOWN            
1225 
1226         if(sk->shutdown & RCV_SHUTDOWN)
1227         {
1228                 /*
1229                  *      FIXME: BSD has some magic to avoid sending resets to
1230                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
1231                  *      BSD stacks still have broken keepalives so we want to
1232                  *      cope with it.
1233                  */
1234 
1235                 if(skb->len)    /* We don't care if it's just an ack or
1236                                    a keepalive/window probe */
1237                 {
1238                         new_seq = skb->seq + skb->len + th->syn;        /* Right edge of _data_ part of frame */
1239                         
1240                         /* Do this the way 4.4BSD treats it. Not what I'd
1241                            regard as the meaning of the spec but it's what BSD
1242                            does and clearly they know everything 8) */
1243 
1244                         /*
1245                          *      This is valid because of two things
1246                          *
1247                          *      a) The way tcp_data behaves at the bottom.
1248                          *      b) A fin takes effect when read not when received.
1249                          */
1250                          
1251                         shut_seq = sk->acked_seq+1;     /* Last byte */
1252                         
1253                         if(after(new_seq,shut_seq))
1254                         {
1255                                 if(sk->debug)
1256                                         printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
1257                                                 sk, new_seq, shut_seq, sk->blog);
1258                                 if(sk->dead)
1259                                 {
1260                                         sk->acked_seq = new_seq + th->fin;
1261                                         tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
1262                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
1263                                         tcp_statistics.TcpEstabResets++;
1264                                         sk->err = EPIPE;
1265                                         sk->error_report(sk);
1266                                         sk->shutdown = SHUTDOWN_MASK;
1267                                         tcp_set_state(sk,TCP_CLOSE);
1268                                         kfree_skb(skb, FREE_READ);
1269                                         return 0;
1270                                 }
1271                         }
1272                 }
1273         }
1274 
1275 #endif
1276 
1277         /*
1278          *      Now we have to walk the chain, and figure out where this one
1279          *      goes into it.  This is set up so that the last packet we received
1280          *      will be the first one we look at, that way if everything comes
1281          *      in order, there will be no performance loss, and if they come
1282          *      out of order we will be able to fit things in nicely.
1283          *
1284          *      [AC: This is wrong. We should assume in order first and then walk
1285          *       forwards from the first hole based upon real traffic patterns.]
1286          *      
1287          */
1288 
1289         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
1290         {
1291                 skb_queue_head(&sk->receive_queue,skb);
1292                 skb1= NULL;
1293         } 
1294         else
1295         {
1296                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
1297                 {
1298                         if(sk->debug)
1299                         {
1300                                 printk("skb1=%p :", skb1);
1301                                 printk("skb1->seq = %d: ", skb1->seq);
1302                                 printk("skb->seq = %d\n",skb->seq);
1303                                 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
1304                                                 sk->acked_seq);
1305                         }
1306                         
1307                         /*
1308                          *      Optimisation: Duplicate frame or extension of previous frame from
1309                          *      same sequence point (lost ack case).
1310                          *      The frame contains duplicate data or replaces a previous frame
1311                          *      discard the previous frame (safe as sk->users is set) and put
1312                          *      the new one in its place.
1313                          */
1314                          
1315                         if (skb->seq==skb1->seq && skb->len>=skb1->len)
1316                         {
1317                                 skb_append(skb1,skb);
1318                                 skb_unlink(skb1);
1319                                 kfree_skb(skb1,FREE_READ);
1320                                 dup_dumped=1;
1321                                 skb1=NULL;
1322                                 break;
1323                         }
1324                         
1325                         /*
1326                          *      Found where it fits
1327                          */
1328                          
1329                         if (after(skb->seq+1, skb1->seq))
1330                         {
1331                                 skb_append(skb1,skb);
1332                                 break;
1333                         }
1334                         
1335                         /*
1336                          *      See if we've hit the start. If so insert.
1337                          */
1338                         if (skb1 == skb_peek(&sk->receive_queue))
1339                         {
1340                                 skb_queue_head(&sk->receive_queue, skb);
1341                                 break;
1342                         }
1343                 }
1344         }
1345 
1346         /*
1347          *      Figure out what the ack value for this frame is
1348          */
1349          
1350         if (before(sk->acked_seq, sk->copied_seq)) 
1351         {
1352                 printk("*** tcp.c:tcp_data bug acked < copied\n");
1353                 sk->acked_seq = sk->copied_seq;
1354         }
1355 
1356         /*
1357          *      Now figure out if we can ack anything. This is very messy because we really want two
1358          *      receive queues, a completed and an assembly queue. We also want only one transmit
1359          *      queue.
1360          */
1361 
1362         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1)) 
1363         {
1364                 if (before(skb->seq, sk->acked_seq+1)) 
1365                 {
1366 
1367                         if (after(skb->end_seq, sk->acked_seq)) 
1368                                 sk->acked_seq = skb->end_seq;
1369 
1370                         skb->acked = 1;
1371 
1372                         /*
1373                          *      When we ack the fin, we do the FIN 
1374                          *      processing.
1375                          */
1376 
1377                         if (skb->h.th->fin) 
1378                         {
1379                                 tcp_fin(skb,sk,skb->h.th);
1380                         }
1381           
1382                         for(skb2 = skb->next;
1383                             skb2 != (struct sk_buff *)&sk->receive_queue;
1384                             skb2 = skb2->next) 
1385                         {
1386                                 if (before(skb2->seq, sk->acked_seq+1)) 
1387                                 {
1388                                         if (after(skb2->end_seq, sk->acked_seq))
1389                                                 sk->acked_seq = skb2->end_seq;
1390 
1391                                         skb2->acked = 1;
1392                                         /*
1393                                          *      When we ack the fin, we do
1394                                          *      the fin handling.
1395                                          */
1396                                         if (skb2->h.th->fin) 
1397                                         {
1398                                                 tcp_fin(skb,sk,skb->h.th);
1399                                         }
1400 
1401                                         /*
1402                                          *      Force an immediate ack.
1403                                          */
1404                                          
1405                                         sk->ack_backlog = sk->max_ack_backlog;
1406                                 }
1407                                 else
1408                                 {
1409                                         break;
1410                                 }
1411                         }
1412 
1413                         /*
1414                          *      This also takes care of updating the window.
1415                          *      This if statement needs to be simplified.
1416                          *
1417                          *      rules for delaying an ack:
1418                          *      - delay time <= 0.5 HZ
1419                          *      - we don't have a window update to send
1420                          *      - must send at least every 2 full sized packets
1421                          */
1422                         if (!sk->delay_acks ||
1423                             sk->ack_backlog >= sk->max_ack_backlog || 
1424                             sk->bytes_rcv > sk->max_unacked || th->fin ||
1425                             sk->ato > HZ/2 ||
1426                             tcp_raise_window(sk)) {
1427         /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
1428                         }
1429                         else 
1430                         {
1431                                 sk->ack_backlog++;
1432                                 
1433                                 if(sk->debug)                           
1434                                         printk("Ack queued.\n");
1435                                 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->ato);
1436                         }
1437                 }
1438         }
1439 
1440         /*
1441          *      If we've missed a packet, send an ack.
1442          *      Also start a timer to send another.
1443          */
1444          
1445         if (!skb->acked) 
1446         {
1447         
1448         /*
1449          *      This is important.  If we don't have much room left,
1450          *      we need to throw out a few packets so we have a good
1451          *      window.  Note that mtu is used, not mss, because mss is really
1452          *      for the send side.  He could be sending us stuff as large as mtu.
1453          */
1454                  
1455                 while (sock_rspace(sk) < sk->mtu) 
1456                 {
1457                         skb1 = skb_peek(&sk->receive_queue);
1458                         if (skb1 == NULL) 
1459                         {
1460                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
1461                                 break;
1462                         }
1463 
1464                         /*
1465                          *      Don't throw out something that has been acked. 
1466                          */
1467                  
1468                         if (skb1->acked) 
1469                         {
1470                                 break;
1471                         }
1472                 
1473                         skb_unlink(skb1);
1474                         kfree_skb(skb1, FREE_READ);
1475                 }
1476                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
1477                 sk->ack_backlog++;
1478                 tcp_reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, 0.5 * HZ));
1479         }
1480         else
1481         {
1482                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
1483         }
1484 
1485         /*
1486          *      Now tell the user we may have some data. 
1487          */
1488          
1489         if (!sk->dead) 
1490         {
1491                 if(sk->debug)
1492                         printk("Data wakeup.\n");
1493                 sk->data_ready(sk,0);
1494         } 
1495         return(0);
1496 }
1497 
1498 
1499 /*
1500  *      This routine is only called when we have urgent data
1501  *      signalled. Its the 'slow' part of tcp_urg. It could be
1502  *      moved inline now as tcp_urg is only called from one
1503  *      place. We handle URGent data wrong. We have to - as
1504  *      BSD still doesn't use the correction from RFC961.
1505  */
1506  
1507 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /* [previous][next][first][last][top][bottom][index][help] */
1508 {
1509         u32 ptr = ntohs(th->urg_ptr);
1510 
1511         if (ptr)
1512                 ptr--;
1513         ptr += ntohl(th->seq);
1514 
1515         /* ignore urgent data that we've already seen and read */
1516         if (after(sk->copied_seq, ptr))
1517                 return;
1518 
1519         /* do we already have a newer (or duplicate) urgent pointer? */
1520         if (sk->urg_data && !after(ptr, sk->urg_seq))
1521                 return;
1522 
1523         /* tell the world about our new urgent pointer */
1524         if (sk->proc != 0) {
1525                 if (sk->proc > 0) {
1526                         kill_proc(sk->proc, SIGURG, 1);
1527                 } else {
1528                         kill_pg(-sk->proc, SIGURG, 1);
1529                 }
1530         }
1531         sk->urg_data = URG_NOTYET;
1532         sk->urg_seq = ptr;
1533 }
1534 
1535 /*
1536  *      This is the 'fast' part of urgent handling.
1537  */
1538  
1539 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
     /* [previous][next][first][last][top][bottom][index][help] */
1540 {
1541         /*
1542          *      Check if we get a new urgent pointer - normally not 
1543          */
1544          
1545         if (th->urg)
1546                 tcp_check_urg(sk,th);
1547 
1548         /*
1549          *      Do we wait for any urgent data? - normally not
1550          */
1551          
1552         if (sk->urg_data == URG_NOTYET) {
1553                 u32 ptr;
1554 
1555                 /*
1556                  *      Is the urgent pointer pointing into this packet? 
1557                  */      
1558                 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
1559                 if (ptr < len) {
1560                         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
1561                         if (!sk->dead)
1562                                 sk->data_ready(sk,0);
1563                 }
1564         }
1565 }
1566 
1567 
/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 *
 *	Main receive entry point. Validates the segment (checksum,
 *	socket lookup, sequence check), then walks the RFC793 incoming
 *	segment processing with the RFC1122 corrections, dispatching to
 *	tcp_conn_request / tcp_ack / tcp_urg / tcp_data as appropriate.
 *	Returns 0 in all cases; the skb is either queued, handed to
 *	tcp_data, or freed here.
 */
 
int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;		/* set when a SYN is expected (SYN_SENT handshake) */

	/*
	 * "redo" is 1 if we have already seen this skb but couldn't
	 * use it at that time (the socket was locked).  In that case
	 * we have already done a lot of the work (looked up the socket
	 * etc).
	 */
	th = skb->h.th;
	sk = skb->sk;
	if (!redo) {
		tcp_statistics.TcpInSegs++;
		/* Only segments addressed to this host are processed. */
		if (skb->pkt_type!=PACKET_HOST)
			goto discard_it;

		/*
		 *	Pull up the IP header.
		 */
	
		skb_pull(skb, skb->h.raw-skb->data);

		/*
		 *	Try to use the device checksum if provided.
		 */
		switch (skb->ip_summed) 
		{
			case CHECKSUM_NONE:
				skb->csum = csum_partial((char *)th, len, 0);
				/* fall through: verify the freshly computed sum */
			case CHECKSUM_HW:
				if (tcp_check(th, len, saddr, daddr, skb->csum))
					goto discard_it;
				/* fall through */
			default:
				/* CHECKSUM_UNNECESSARY */
		}
		sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
		if (!sk)
			goto no_tcp_socket;
		skb->sk = sk;
		skb->seq = ntohl(th->seq);
		/* SYN and FIN each consume one unit of sequence space. */
		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
		skb->ack_seq = ntohl(th->ack_seq);

		skb->acked = 0;
		skb->used = 0;
		skb->free = 1;
		/* Stored swapped: from this socket's point of view. */
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here. */
		if (sk->users) 
		{
			skb_queue_tail(&sk->back_log, skb);
			return(0);
		}
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes 
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk->zapped || sk->state==TCP_CLOSE)
		goto no_tcp_socket;

	/* Sanity: a live TCP socket must have its protocol ops attached. */
	if (!sk->prot) 
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket. 
	 *	(Undone on the BSD TIME_WAIT recycling path below.)
	 */
	 
	skb->sk=sk;
	sk->rmem_alloc += skb->truesize;
	
	/*
	 *	We should now do header prediction.
	 */
	 
	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{
	
		/*
		 *	Now deal with unusual cases.
		 */
	 
		if(sk->state==TCP_LISTEN)
		{
			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it 
			 */
			   
			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				return 0;
			}
		
			/*	
			 *	Guess we need to make a new socket up 
			 */
		
			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
		
			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 *
			 *	Now TTCP is starting to use we ought to queue this data.
			 */
			 
			return 0;
		}
	
		/* 
		 *	Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
		 *	then its a new connection
		 */
		 
		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			return 0;
		}
		
		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected. The SYN_SENT case is unusual and should
		 *	not be in line code. [AC]
		 */
	   
		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,skb->ack_seq,len))
				{
					/* Reset the ack - its an ack from a 
					   different connection  [ th->rst is checked in tcp_send_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return(0);
				}
				if(th->rst)
					return tcp_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq = skb->seq+1;
				sk->lastwin_seq = skb->seq+1;
				sk->fin_seq = skb->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				/* Peer advertised a zero window in its SYN|ACK:
				   fall back to a small conservative default. */
				if(sk->max_window==0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);
					
					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}		
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
	 *	a more complex suggestion for fixing these reuse issues in RFC1644
	 *	but not yet ready for general use. Also see RFC1379.
	 *
	 *	Note the funny way we go back to the top of this function for
	 *	this case ("goto try_next_socket").  That also takes care of
	 *	checking "sk->users" for the new socket as well as doing all
	 *	the normal tests on the packet.
	 *	NOTE(review): the "goto try_next_socket" this comment mentions
	 *	does not exist in this version - confirm against later revisions.
	 */
	
#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		/* A fresh SYN on a dead TIME_WAIT socket: kill the old
		   connection and, if a listener exists, treat it as a
		   brand-new connection request with a bumped ISN. */
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
			after(skb->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;	   
			/* Uncharge the old socket before reassigning the skb. */
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			/* this is not really correct: we should check sk->users */
			if (sk && sk->state==TCP_LISTEN)
			{
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				/* seq+128000 keeps the new ISN clear of the old
				   connection's sequence space. */
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif	
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs 
	 */
	
	/* Step 1: sequence check (SYN excluded from the end-sequence). */
	if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
	{
		bad_tcp_sequence(sk, th, len, opt, saddr, dev);
		kfree_skb(skb, FREE_READ);
		return 0;
	}

	/* Step 2: an in-window RST kills the connection. */
	if(th->rst)
		return tcp_reset(sk,skb);
	
	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 *	An unexpected SYN in-window means the peer restarted: reset.
	 */
	 
	if(th->syn && !syn_ok)
	{
		tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_reset(sk,skb);	
	}

	/* Update the delayed-ack timeout estimate for this flow. */
	tcp_delack_estimator(sk);
	
	/*
	 *	Process the ACK
	 */
	 

	if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
	{
		/*
		 *	Our three way handshake failed.
		 */
		 
		if(sk->state==TCP_SYN_RECV)
		{
			tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		return 0;
	}
	
rfc_step6:		/* I'll clean this up later */

	/*
	 *	If the accepted buffer put us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */
	 
	if (sk->rmem_alloc  >= sk->rcvbuf) 
	{
		kfree_skb(skb, FREE_READ);
		return(0);
	}


	/*
	 *	Process urgent data
	 */
		
	tcp_urg(sk, th, len);
	
	/*
	 *	Process the encapsulated data
	 *	(tcp_data returns nonzero when it did NOT keep the skb).
	 */
	
	if(tcp_data(skb,sk, saddr, len))
		kfree_skb(skb, FREE_READ);

	/*
	 *	And done
	 */	
	
	return 0;

no_tcp_socket:
	/*
	 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
	 */
	tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
	/* fall through to discard the offending frame */

discard_it:
	/*
	 *	Discard frame
	 */
	skb->sk = NULL;
	kfree_skb(skb, FREE_READ);
	return 0;
}

/* [previous][next][first][last][top][bottom][index][help] */