root/net/ipv4/tcp_input.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. tcp_cache_zap
  2. get_tcp_sock
  3. bad_tcp_sequence
  4. tcp_sequence
  5. tcp_reset
  6. tcp_options
  7. tcp_conn_request
  8. tcp_ack
  9. tcp_fin
  10. tcp_data
  11. tcp_check_urg
  12. tcp_urg
  13. tcp_rcv

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp_input.c 1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  */
  22 
  23 #include <linux/config.h>
  24 #include <net/tcp.h>
  25 
  26 /*
  27  *      Cached last hit socket
  28  */
  29  
  30 static volatile unsigned long   th_cache_saddr,th_cache_daddr;
  31 static volatile unsigned short  th_cache_dport, th_cache_sport;
  32 static volatile struct sock *th_cache_sk;
  33 
  34 void tcp_cache_zap(void)
     /* [previous][next][first][last][top][bottom][index][help] */
  35 {
  36         th_cache_sk=NULL;
  37 }
  38 
  39 /*
  40  *      Find the socket, using the last hit cache if applicable.
  41  */
  42 static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
     /* [previous][next][first][last][top][bottom][index][help] */
  43 {
  44         struct sock * sk;
  45 
  46         sk = (struct sock *) th_cache_sk;
  47         if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
  48             sport != th_cache_sport || dport != th_cache_dport) {
  49                 sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
  50                 if (sk) {
  51                         th_cache_saddr=saddr;
  52                         th_cache_daddr=daddr;
  53                         th_cache_dport=dport;
  54                         th_cache_sport=sport;
  55                         th_cache_sk=sk;
  56                 }
  57         }
  58         return sk;
  59 }
  60 
  61 /*
 * React to an out-of-window TCP sequence number in an incoming packet
  63  */
  64 static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
     /* [previous][next][first][last][top][bottom][index][help] */
  65              struct options *opt, unsigned long saddr, struct device *dev)
  66 {
  67         if (th->rst)
  68                 return;
  69 
  70         /*
  71          *      Send a reset if we get something not ours and we are
  72          *      unsynchronized. Note: We don't do anything to our end. We
  73          *      are just killing the bogus remote connection then we will
  74          *      connect again and it will work (with luck).
  75          */
  76          
  77         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
  78         {
  79                 tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
  80                 return;
  81         }
  82 
  83         /* Try to resync things. */
  84         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
  85         return;
  86 }
  87 
  88 /*
 *      This function checks to see if the tcp header is actually acceptable.
  90  */
  91  
  92 extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
     /* [previous][next][first][last][top][bottom][index][help] */
  93 {
  94         u32 end_window = sk->acked_seq + sk->window;
  95         return  /* if start is at end of window, end must be too (zero window) */
  96                 (seq == end_window && seq == end_seq) ||
  97                 /* if start is before end of window, check for interest */
  98                 (before(seq, end_window) && !before(end_seq, sk->acked_seq));
  99 }
 100 
 101 /*
 102  *      When we get a reset we do this.
 103  */
 104 
 105 static int tcp_reset(struct sock *sk, struct sk_buff *skb)
     /* [previous][next][first][last][top][bottom][index][help] */
 106 {
 107         sk->zapped = 1;
 108         sk->err = ECONNRESET;
 109         if (sk->state == TCP_SYN_SENT)
 110                 sk->err = ECONNREFUSED;
 111         if (sk->state == TCP_CLOSE_WAIT)
 112                 sk->err = EPIPE;
 113 #ifdef TCP_DO_RFC1337           
 114         /*
 115          *      Time wait assassination protection [RFC1337]
 116          */
 117         if(sk->state!=TCP_TIME_WAIT)
 118         {       
 119                 tcp_set_state(sk,TCP_CLOSE);
 120                 sk->shutdown = SHUTDOWN_MASK;
 121         }
 122 #else   
 123         tcp_set_state(sk,TCP_CLOSE);
 124         sk->shutdown = SHUTDOWN_MASK;
 125 #endif  
 126         if (!sk->dead) 
 127                 sk->state_change(sk);
 128         kfree_skb(skb, FREE_READ);
 129         release_sock(sk);
 130         return(0);
 131 }
 132 
 133 
 134 /*
 135  *      Look for tcp options. Parses everything but only knows about MSS.
 136  *      This routine is always called with the packet containing the SYN.
 137  *      However it may also be called with the ack to the SYN.  So you
 138  *      can't assume this is always the SYN.  It's always called after
 139  *      we have set up sk->mtu to our own MTU.
 140  *
 141  *      We need at minimum to add PAWS support here. Possibly large windows
 142  *      as Linux gets deployed on 100Mb/sec networks.
 143  */
 144  
 145 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
 146 {
 147         unsigned char *ptr;
 148         int length=(th->doff*4)-sizeof(struct tcphdr);
 149         int mss_seen = 0;
 150     
 151         ptr = (unsigned char *)(th + 1);
 152   
 153         while(length>0)
 154         {
 155                 int opcode=*ptr++;
 156                 int opsize=*ptr++;
 157                 switch(opcode)
 158                 {
 159                         case TCPOPT_EOL:
 160                                 return;
 161                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
 162                                 length--;
 163                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
 164                                 continue;
 165                         
 166                         default:
 167                                 if(opsize<=2)   /* Avoid silly options looping forever */
 168                                         return;
 169                                 switch(opcode)
 170                                 {
 171                                         case TCPOPT_MSS:
 172                                                 if(opsize==4 && th->syn)
 173                                                 {
 174                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
 175                                                         mss_seen = 1;
 176                                                 }
 177                                                 break;
 178                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
 179                                 }
 180                                 ptr+=opsize-2;
 181                                 length-=opsize;
 182                 }
 183         }
 184         if (th->syn) 
 185         {
 186                 if (! mss_seen)
 187                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
 188         }
 189 #ifdef CONFIG_INET_PCTCP
 190         sk->mss = min(sk->max_window >> 1, sk->mtu);
 191 #else    
 192         sk->mss = min(sk->max_window, sk->mtu);
 193         sk->max_unacked = 2 * sk->mss;
 194 #endif  
 195 }
 196 
 197 
 198 /*
 199  *      This routine handles a connection request.
 200  *      It should make sure we haven't already responded.
 201  *      Because of the way BSD works, we have to send a syn/ack now.
 202  *      This also means it will be harder to close a socket which is
 203  *      listening.
 204  */
 205  
 206 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
     /* [previous][next][first][last][top][bottom][index][help] */
 207                  u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
 208 {
 209         struct sock *newsk;
 210         struct tcphdr *th;
 211         struct rtable *rt;
 212   
 213         th = skb->h.th;
 214 
 215         /* If the socket is dead, don't accept the connection. */
 216         if (!sk->dead) 
 217         {
 218                 sk->data_ready(sk,0);
 219         }
 220         else 
 221         {
 222                 if(sk->debug)
 223                         printk("Reset on %p: Connect on dead socket.\n",sk);
 224                 tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
 225                 tcp_statistics.TcpAttemptFails++;
 226                 kfree_skb(skb, FREE_READ);
 227                 return;
 228         }
 229 
 230         /*
 231          * Make sure we can accept more.  This will prevent a
 232          * flurry of syns from eating up all our memory.
 233          */
 234 
 235         if (sk->ack_backlog >= sk->max_ack_backlog) 
 236         {
 237                 tcp_statistics.TcpAttemptFails++;
 238                 kfree_skb(skb, FREE_READ);
 239                 return;
 240         }
 241 
 242         /*
 243          * We need to build a new sock struct.
 244          * It is sort of bad to have a socket without an inode attached
 245          * to it, but the wake_up's will just wake up the listening socket,
 246          * and if the listening socket is destroyed before this is taken
 247          * off of the queue, this will take care of it.
 248          */
 249 
 250         newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
 251         if (newsk == NULL) 
 252         {
 253                 /* just ignore the syn.  It will get retransmitted. */
 254                 tcp_statistics.TcpAttemptFails++;
 255                 kfree_skb(skb, FREE_READ);
 256                 return;
 257         }
 258 
 259         memcpy(newsk, sk, sizeof(*newsk));
 260         newsk->opt = NULL;
 261         newsk->ip_route_cache  = NULL;
 262         if (opt && opt->optlen) {
 263           sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
 264           if (!sk->opt) {
 265                 kfree_s(newsk, sizeof(struct sock));
 266                 tcp_statistics.TcpAttemptFails++;
 267                 kfree_skb(skb, FREE_READ);
 268                 return;
 269           }
 270           if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {
 271                 kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
 272                 kfree_s(newsk, sizeof(struct sock));
 273                 tcp_statistics.TcpAttemptFails++;
 274                 kfree_skb(skb, FREE_READ);
 275                 return;
 276           }
 277         }
 278         skb_queue_head_init(&newsk->write_queue);
 279         skb_queue_head_init(&newsk->receive_queue);
 280         newsk->send_head = NULL;
 281         newsk->send_tail = NULL;
 282         skb_queue_head_init(&newsk->back_log);
 283         newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
 284         newsk->rto = TCP_TIMEOUT_INIT;
 285         newsk->mdev = 0;
 286         newsk->max_window = 0;
 287         newsk->cong_window = 1;
 288         newsk->cong_count = 0;
 289         newsk->ssthresh = 0;
 290         newsk->backoff = 0;
 291         newsk->blog = 0;
 292         newsk->intr = 0;
 293         newsk->proc = 0;
 294         newsk->done = 0;
 295         newsk->partial = NULL;
 296         newsk->pair = NULL;
 297         newsk->wmem_alloc = 0;
 298         newsk->rmem_alloc = 0;
 299         newsk->localroute = sk->localroute;
 300 
 301         newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
 302 
 303         newsk->err = 0;
 304         newsk->shutdown = 0;
 305         newsk->ack_backlog = 0;
 306         newsk->acked_seq = skb->seq+1;
 307         newsk->lastwin_seq = skb->seq+1;
 308         newsk->delay_acks = 1;
 309         newsk->copied_seq = skb->seq+1;
 310         newsk->fin_seq = skb->seq;
 311         newsk->state = TCP_SYN_RECV;
 312         newsk->timeout = 0;
 313         newsk->ip_xmit_timeout = 0;
 314         newsk->write_seq = seq; 
 315         newsk->window_seq = newsk->write_seq;
 316         newsk->rcv_ack_seq = newsk->write_seq;
 317         newsk->urg_data = 0;
 318         newsk->retransmits = 0;
 319         newsk->linger=0;
 320         newsk->destroy = 0;
 321         init_timer(&newsk->timer);
 322         newsk->timer.data = (unsigned long)newsk;
 323         newsk->timer.function = &net_timer;
 324         init_timer(&newsk->retransmit_timer);
 325         newsk->retransmit_timer.data = (unsigned long)newsk;
 326         newsk->retransmit_timer.function=&tcp_retransmit_timer;
 327         newsk->dummy_th.source = skb->h.th->dest;
 328         newsk->dummy_th.dest = skb->h.th->source;
 329         
 330         /*
 331          *      Swap these two, they are from our point of view. 
 332          */
 333          
 334         newsk->daddr = saddr;
 335         newsk->saddr = daddr;
 336         newsk->rcv_saddr = daddr;
 337 
 338         put_sock(newsk->num,newsk);
 339         newsk->dummy_th.res1 = 0;
 340         newsk->dummy_th.doff = 6;
 341         newsk->dummy_th.fin = 0;
 342         newsk->dummy_th.syn = 0;
 343         newsk->dummy_th.rst = 0;        
 344         newsk->dummy_th.psh = 0;
 345         newsk->dummy_th.ack = 0;
 346         newsk->dummy_th.urg = 0;
 347         newsk->dummy_th.res2 = 0;
 348         newsk->acked_seq = skb->seq + 1;
 349         newsk->copied_seq = skb->seq + 1;
 350         newsk->socket = NULL;
 351 
 352         /*
 353          *      Grab the ttl and tos values and use them 
 354          */
 355 
 356         newsk->ip_ttl=sk->ip_ttl;
 357         newsk->ip_tos=skb->ip_hdr->tos;
 358 
 359         /*
 360          *      Use 512 or whatever user asked for 
 361          */
 362 
 363         /*
 364          *      Note use of sk->user_mss, since user has no direct access to newsk 
 365          */
 366 
 367         rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
 368         newsk->ip_route_cache = rt;
 369         
 370         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
 371                 newsk->window_clamp = rt->rt_window;
 372         else
 373                 newsk->window_clamp = 0;
 374                 
 375         if (sk->user_mss)
 376                 newsk->mtu = sk->user_mss;
 377         else if (rt)
 378                 newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
 379         else 
 380                 newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
 381 
 382         /*
 383          *      But not bigger than device MTU 
 384          */
 385 
 386         newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
 387 
 388 #ifdef CONFIG_SKIP
 389         
 390         /*
 391          *      SKIP devices set their MTU to 65535. This is so they can take packets
 392          *      unfragmented to security process then fragment. They could lie to the
 393          *      TCP layer about a suitable MTU, but its easier to let skip sort it out
 394          *      simply because the final package we want unfragmented is going to be
 395          *
 396          *      [IPHDR][IPSP][Security data][Modified TCP data][Security data]
 397          */
 398          
 399         if(skip_pick_mtu!=NULL)         /* If SKIP is loaded.. */
 400                 sk->mtu=skip_pick_mtu(sk->mtu,dev);
 401 #endif
 402         /*
 403          *      This will min with what arrived in the packet 
 404          */
 405 
 406         tcp_options(newsk,skb->h.th);
 407         
 408         tcp_cache_zap();
 409         tcp_send_synack(newsk, sk, skb);
 410 }
 411 
 412 /*
 413  *      This routine deals with incoming acks, but not outgoing ones.
 414  */
 415 
 416 static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
     /* [previous][next][first][last][top][bottom][index][help] */
 417 {
 418         int flag = 0;
 419         unsigned window;
 420 
 421         /* 
 422          * 1 - there was data in packet as well as ack or new data is sent or 
 423          *     in shutdown state
 424          * 2 - data from retransmit queue was acked and removed
 425          * 4 - window shrunk or data from retransmit queue was acked and removed
 426          */
 427 
 428         if(sk->zapped)
 429                 return(1);      /* Dead, cant ack any more so why bother */
 430 
 431         /*
 432          *      Have we discovered a larger window
 433          */
 434          
 435         window = ntohs(th->window);
 436 
 437         if (window > sk->max_window) 
 438         {
 439                 sk->max_window = window;
 440 #ifdef CONFIG_INET_PCTCP
 441                 /* Hack because we don't send partial packets to non SWS
 442                    handling hosts */
 443                 sk->mss = min(window>>1, sk->mtu);
 444 #else
 445                 sk->mss = min(window, sk->mtu);
 446 #endif  
 447         }
 448 
 449         /*
 450          *      We have dropped back to keepalive timeouts. Thus we have
 451          *      no retransmits pending.
 452          */
 453          
 454         if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
 455                 sk->retransmits = 0;
 456 
 457         /*
 458          *      If the ack is newer than sent or older than previous acks
 459          *      then we can probably ignore it.
 460          */
 461          
 462         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
 463         {
 464                 if(sk->debug)
 465                         printk("Ack ignored %u %u\n",ack,sk->sent_seq);
 466                         
 467                 /*
 468                  *      Keepalive processing.
 469                  */
 470                  
 471                 if (after(ack, sk->sent_seq)) 
 472                 {
 473                         return(0);
 474                 }
 475                 
 476                 /*
 477                  *      Restart the keepalive timer.
 478                  */
 479                  
 480                 if (sk->keepopen) 
 481                 {
 482                         if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
 483                                 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
 484                 }
 485                 return(1);
 486         }
 487 
 488         /*
 489          *      If there is data set flag 1
 490          */
 491          
 492         if (len != th->doff*4) 
 493                 flag |= 1;
 494 
 495         /*
 496          *      See if our window has been shrunk. 
 497          */
 498 
 499         if (after(sk->window_seq, ack+window)) 
 500         {
 501                 /*
 502                  * We may need to move packets from the send queue
 503                  * to the write queue, if the window has been shrunk on us.
 504                  * The RFC says you are not allowed to shrink your window
 505                  * like this, but if the other end does, you must be able
 506                  * to deal with it.
 507                  */
 508                 struct sk_buff *skb;
 509                 struct sk_buff *skb2;
 510                 struct sk_buff *wskb = NULL;
 511         
 512                 skb2 = sk->send_head;
 513                 sk->send_head = NULL;
 514                 sk->send_tail = NULL;
 515         
 516                 /*
 517                  *      This is an artifact of a flawed concept. We want one
 518                  *      queue and a smarter send routine when we send all.
 519                  */
 520         
 521                 flag |= 4;      /* Window changed */
 522         
 523                 sk->window_seq = ack + window;
 524                 cli();
 525                 while (skb2 != NULL) 
 526                 {
 527                         skb = skb2;
 528                         skb2 = skb->link3;
 529                         skb->link3 = NULL;
 530                         if (after(skb->end_seq, sk->window_seq)) 
 531                         {
 532                                 if (sk->packets_out > 0) 
 533                                         sk->packets_out--;
 534                                 /* We may need to remove this from the dev send list. */
 535                                 if (skb->next != NULL) 
 536                                 {
 537                                         skb_unlink(skb);                                
 538                                 }
 539                                 /* Now add it to the write_queue. */
 540                                 if (wskb == NULL)
 541                                         skb_queue_head(&sk->write_queue,skb);
 542                                 else
 543                                         skb_append(wskb,skb);
 544                                 wskb = skb;
 545                         } 
 546                         else 
 547                         {
 548                                 if (sk->send_head == NULL) 
 549                                 {
 550                                         sk->send_head = skb;
 551                                         sk->send_tail = skb;
 552                                 }
 553                                 else
 554                                 {
 555                                         sk->send_tail->link3 = skb;
 556                                         sk->send_tail = skb;
 557                                 }
 558                                 skb->link3 = NULL;
 559                         }
 560                 }
 561                 sti();
 562         }
 563 
 564         /*
 565          *      Pipe has emptied
 566          */
 567          
 568         if (sk->send_tail == NULL || sk->send_head == NULL) 
 569         {
 570                 sk->send_head = NULL;
 571                 sk->send_tail = NULL;
 572                 sk->packets_out= 0;
 573         }
 574 
 575         /*
 576          *      Update the right hand window edge of the host
 577          */
 578          
 579         sk->window_seq = ack + window;
 580 
 581         /*
 582          *      We don't want too many packets out there. 
 583          */
 584          
 585         if (sk->ip_xmit_timeout == TIME_WRITE && 
 586                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
 587         {
 588                 /* 
 589                  * This is Jacobson's slow start and congestion avoidance. 
 590                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
 591                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
 592                  * counter and increment it once every cwnd times.  It's possible
 593                  * that this should be done only if sk->retransmits == 0.  I'm
 594                  * interpreting "new data is acked" as including data that has
 595                  * been retransmitted but is just now being acked.
 596                  */
 597                 if (sk->cong_window < sk->ssthresh)  
 598                         /* 
 599                          *      In "safe" area, increase
 600                          */
 601                         sk->cong_window++;
 602                 else 
 603                 {
 604                         /*
 605                          *      In dangerous area, increase slowly.  In theory this is
 606                          *      sk->cong_window += 1 / sk->cong_window
 607                          */
 608                         if (sk->cong_count >= sk->cong_window) 
 609                         {
 610                                 sk->cong_window++;
 611                                 sk->cong_count = 0;
 612                         }
 613                         else 
 614                                 sk->cong_count++;
 615                 }
 616         }
 617 
 618         /*
 619          *      Remember the highest ack received.
 620          */
 621          
 622         sk->rcv_ack_seq = ack;
 623         
 624         /*
 625          *      We passed data and got it acked, remove any soft error
 626          *      log. Something worked...
 627          */
 628          
 629         sk->err_soft = 0;
 630 
 631         /*
 632          *      If this ack opens up a zero window, clear backoff.  It was
 633          *      being used to time the probes, and is probably far higher than
 634          *      it needs to be for normal retransmission.
 635          */
 636 
 637         if (sk->ip_xmit_timeout == TIME_PROBE0) 
 638         {
 639                 sk->retransmits = 0;    /* Our probe was answered */
 640                 
 641                 /*
 642                  *      Was it a usable window open ?
 643                  */
 644                  
 645                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
 646                     ! before (sk->window_seq, sk->write_queue.next->end_seq)) 
 647                 {
 648                         sk->backoff = 0;
 649                         
 650                         /*
 651                          *      Recompute rto from rtt.  this eliminates any backoff.
 652                          */
 653 
 654                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
 655                         if (sk->rto > 120*HZ)
 656                                 sk->rto = 120*HZ;
 657                         if (sk->rto < HZ/5)     /* Was 1*HZ, then 1 - turns out we must allow about
 658                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
 659                                                    .2 of a second is going to need huge windows (SIGH) */
 660                         sk->rto = HZ/5;
 661                 }
 662         }
 663 
 664         /* 
 665          *      See if we can take anything off of the retransmit queue.
 666          */
 667    
 668         while(sk->send_head != NULL) 
 669         {
 670                 /* Check for a bug. */
 671                 if (sk->send_head->link3 &&
 672                     after(sk->send_head->end_seq, sk->send_head->link3->end_seq)) 
 673                         printk("INET: tcp.c: *** bug send_list out of order.\n");
 674                         
 675                 /*
 676                  *      If our packet is before the ack sequence we can
 677                  *      discard it as it's confirmed to have arrived the other end.
 678                  */
 679                  
 680                 if (before(sk->send_head->end_seq, ack+1)) 
 681                 {
 682                         struct sk_buff *oskb;   
 683                         if (sk->retransmits) 
 684                         {       
 685                                 /*
 686                                  *      We were retransmitting.  don't count this in RTT est 
 687                                  */
 688                                 flag |= 2;
 689 
 690                                 /*
 691                                  * even though we've gotten an ack, we're still
 692                                  * retransmitting as long as we're sending from
 693                                  * the retransmit queue.  Keeping retransmits non-zero
 694                                  * prevents us from getting new data interspersed with
 695                                  * retransmissions.
 696                                  */
 697 
 698                                 if (sk->send_head->link3)       /* Any more queued retransmits? */
 699                                         sk->retransmits = 1;
 700                                 else
 701                                         sk->retransmits = 0;
 702                         }
 703                         /*
 704                          * Note that we only reset backoff and rto in the
 705                          * rtt recomputation code.  And that doesn't happen
 706                          * if there were retransmissions in effect.  So the
 707                          * first new packet after the retransmissions is
 708                          * sent with the backoff still in effect.  Not until
 709                          * we get an ack from a non-retransmitted packet do
 710                          * we reset the backoff and rto.  This allows us to deal
 711                          * with a situation where the network delay has increased
 712                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
 713                          */
 714 
 715                         /*
 716                          *      We have one less packet out there. 
 717                          */
 718                          
 719                         if (sk->packets_out > 0) 
 720                                 sk->packets_out --;
 721 
 722                         oskb = sk->send_head;
 723 
 724                         if (!(flag&2))  /* Not retransmitting */
 725                         {
 726                                 long m;
 727         
 728                                 /*
 729                                  *      The following amusing code comes from Jacobson's
 730                                  *      article in SIGCOMM '88.  Note that rtt and mdev
 731                                  *      are scaled versions of rtt and mean deviation.
 732                                  *      This is designed to be as fast as possible 
 733                                  *      m stands for "measurement".
 734                                  */
 735         
 736                                 m = jiffies - oskb->when;  /* RTT */
 737                                 if(m<=0)
 738                                         m=1;            /* IS THIS RIGHT FOR <0 ??? */
 739                                 m -= (sk->rtt >> 3);    /* m is now error in rtt est */
 740                                 sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
 741                                 if (m < 0)
 742                                         m = -m;         /* m is now abs(error) */
 743                                 m -= (sk->mdev >> 2);   /* similar update on mdev */
 744                                 sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
 745         
 746                                 /*
 747                                  *      Now update timeout.  Note that this removes any backoff.
 748                                  */
 749                          
 750                                 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
 751                                 if (sk->rto > 120*HZ)
 752                                         sk->rto = 120*HZ;
 753                                 if (sk->rto < HZ/5)     /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
 754                                         sk->rto = HZ/5;
 755                                 sk->backoff = 0;
 756                         }
 757                         flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
 758                                            In this case as we just set it up */
 759                         cli();
 760                         oskb = sk->send_head;
 761                         IS_SKB(oskb);
 762                         sk->send_head = oskb->link3;
 763                         if (sk->send_head == NULL) 
 764                         {
 765                                 sk->send_tail = NULL;
 766                         }
 767 
 768                 /*
 769                  *      We may need to remove this from the dev send list. 
 770                  */
 771 
 772                         if (oskb->next)
 773                                 skb_unlink(oskb);
 774                         sti();
 775                         kfree_skb(oskb, FREE_WRITE); /* write. */
 776                         if (!sk->dead)
 777                                 sk->write_space(sk);
 778                 }
 779                 else
 780                 {
 781                         break;
 782                 }
 783         }
 784 
 785         /*
 786          * XXX someone ought to look at this too.. at the moment, if skb_peek()
 787  * returns non-NULL, we completely ignore the timer stuff in the else
 788          * clause.  We ought to organize the code so that else clause can
 789          * (should) be executed regardless, possibly moving the PROBE timer
 790          * reset over.  The skb_peek() thing should only move stuff to the
 791          * write queue, NOT also manage the timer functions.
 792          */
 793 
 794         /*
 795          * Maybe we can take some stuff off of the write queue,
 796          * and put it onto the xmit queue.
 797          */
 798         if (skb_peek(&sk->write_queue) != NULL) 
 799         {
 800                 if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
 801                         (sk->retransmits == 0 || 
 802                          sk->ip_xmit_timeout != TIME_WRITE ||
 803                          before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
 804                         && sk->packets_out < sk->cong_window) 
 805                 {
 806                         /*
 807                          *      Add more data to the send queue.
 808                          */
 809                         flag |= 1;
 810                         tcp_write_xmit(sk);
 811                 }
 812                 else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
 813                         sk->send_head == NULL &&
 814                         sk->ack_backlog == 0 &&
 815                         sk->state != TCP_TIME_WAIT) 
 816                 {
 817                         /*
 818                          *      Data to queue but no room.
 819                          */
 820                         tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
 821                 }               
 822         }
 823         else
 824         {
 825                 /*
 826                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
 827                  * from TCP_CLOSE we don't do anything
 828                  *
 829                  * from anything else, if there is write data (or fin) pending,
 830                  * we use a TIME_WRITE timeout, else if keepalive we reset to
 831                  * a KEEPALIVE timeout, else we delete the timer.
 832                  *
 833                  * We do not set flag for nominal write data, otherwise we may
 834                  * force a state where we start to write itsy bitsy tidbits
 835                  * of data.
 836                  */
 837 
 838                 switch(sk->state) {
 839                 case TCP_TIME_WAIT:
 840                         /*
 841                          * keep us in TIME_WAIT until we stop getting packets,
 842                          * reset the timeout.
 843                          */
 844                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 845                         break;
 846                 case TCP_CLOSE:
 847                         /*
 848                          * don't touch the timer.
 849                          */
 850                         break;
 851                 default:
 852                         /*
 853                          *      Must check send_head, write_queue, and ack_backlog
 854                          *      to determine which timeout to use.
 855                          */
 856                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
 857                                 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
 858                         } else if (sk->keepopen) {
 859                                 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
 860                         } else {
 861                                 del_timer(&sk->retransmit_timer);
 862                                 sk->ip_xmit_timeout = 0;
 863                         }
 864                         break;
 865                 }
 866         }
 867 
 868         /*
 869          *      We have nothing queued but space to send. Send any partial
 870          *      packets immediately (end of Nagle rule application).
 871          */
 872          
 873         if (sk->packets_out == 0 && sk->partial != NULL &&
 874                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
 875         {
 876                 flag |= 1;
 877                 tcp_send_partial(sk);
 878         }
 879 
 880         /*
 881          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
 882          * we are now waiting for an acknowledge to our FIN.  The other end is
 883          * already in TIME_WAIT.
 884          *
 885          * Move to TCP_CLOSE on success.
 886          */
 887 
 888         if (sk->state == TCP_LAST_ACK) 
 889         {
 890                 if (!sk->dead)
 891                         sk->state_change(sk);
 892                 if(sk->debug)
 893                         printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
 894                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
 895                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
 896                 {
 897                         flag |= 1;
 898                         sk->shutdown = SHUTDOWN_MASK;
 899                         tcp_set_state(sk,TCP_CLOSE);
 900                         return 1;
 901                 }
 902         }
 903 
 904         /*
 905          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
 906          *
 907          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
 908          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
 909          */
 910 
 911         if (sk->state == TCP_FIN_WAIT1) 
 912         {
 913 
 914                 if (!sk->dead) 
 915                         sk->state_change(sk);
 916                 if (sk->rcv_ack_seq == sk->write_seq) 
 917                 {
 918                         flag |= 1;
 919                         sk->shutdown |= SEND_SHUTDOWN;
 920                         tcp_set_state(sk, TCP_FIN_WAIT2);
 921                 }
 922         }
 923 
 924         /*
 925          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
 926          *
 927          *      Move to TIME_WAIT
 928          */
 929 
 930         if (sk->state == TCP_CLOSING) 
 931         {
 932 
 933                 if (!sk->dead) 
 934                         sk->state_change(sk);
 935                 if (sk->rcv_ack_seq == sk->write_seq) 
 936                 {
 937                         flag |= 1;
 938                         tcp_time_wait(sk);
 939                 }
 940         }
 941         
 942         /*
 943          *      Final ack of a three way shake 
 944          */
 945          
 946         if(sk->state==TCP_SYN_RECV)
 947         {
 948                 tcp_set_state(sk, TCP_ESTABLISHED);
 949                 tcp_options(sk,th);
 950                 sk->dummy_th.dest=th->source;
 951                 sk->copied_seq = sk->acked_seq;
 952                 if(!sk->dead)
 953                         sk->state_change(sk);
 954                 if(sk->max_window==0)
 955                 {
 956                         sk->max_window=32;      /* Sanity check */
 957                         sk->mss=min(sk->max_window,sk->mtu);
 958                 }
 959         }
 960         
 961         /*
 962          * I make no guarantees about the first clause in the following
 963          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
 964          * what conditions "!flag" would be true.  However I think the rest
 965          * of the conditions would prevent that from causing any
 966          * unnecessary retransmission. 
 967          *   Clearly if the first packet has expired it should be 
 968          * retransmitted.  The other alternative, "flag&2 && retransmits", is
 969          * harder to explain:  You have to look carefully at how and when the
 970          * timer is set and with what timeout.  The most recent transmission always
 971          * sets the timer.  So in general if the most recent thing has timed
 972          * out, everything before it has as well.  So we want to go ahead and
 973          * retransmit some more.  If we didn't explicitly test for this
 974          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
 975          * would not be true.  If you look at the pattern of timing, you can
 976          * show that rto is increased fast enough that the next packet would
 977          * almost never be retransmitted immediately.  Then you'd end up
 978          * waiting for a timeout to send each packet on the retransmission
 979          * queue.  With my implementation of the Karn sampling algorithm,
 980          * the timeout would double each time.  The net result is that it would
 981          * take a hideous amount of time to recover from a single dropped packet.
 982          * It's possible that there should also be a test for TIME_WRITE, but
 983          * I think as long as "send_head != NULL" and "retransmit" is on, we've
 984          * got to be in real retransmission mode.
 985          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
 986          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
 987          * As long as no further losses occur, this seems reasonable.
 988          */
 989         
 990         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
 991                (((flag&2) && sk->retransmits) ||
 992                (sk->send_head->when + sk->rto < jiffies))) 
 993         {
 994                 if(sk->send_head->when + sk->rto < jiffies)
 995                         tcp_retransmit(sk,0);   
 996                 else
 997                 {
 998                         tcp_do_retransmit(sk, 1);
 999                         tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1000                 }
1001         }
1002 
1003         return(1);
1004 }
1005 
1006 
1007 /*
1008  *      Process the FIN bit. This now behaves as it is supposed to work
1009  *      and the FIN takes effect when it is validly part of sequence
1010  *      space. Not before when we get holes.
1011  *
1012  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1013  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
1014  *      TIME-WAIT)
1015  *
1016  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
1017  *      close and we go into CLOSING (and later onto TIME-WAIT)
1018  *
1019  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1020  *
1021  */
1022  
1023 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
1024 {
1025         sk->fin_seq = skb->end_seq;
1026 
1027         if (!sk->dead) 
1028         {
1029                 sk->state_change(sk);
1030                 sock_wake_async(sk->socket, 1);
1031         }
1032 
1033         switch(sk->state) 
1034         {
1035                 case TCP_SYN_RECV:
1036                 case TCP_SYN_SENT:
1037                 case TCP_ESTABLISHED:
1038                         /*
1039                          * move to CLOSE_WAIT, tcp_data() already handled
1040                          * sending the ack.
1041                          */
1042                         tcp_set_state(sk,TCP_CLOSE_WAIT);
1043                         if (th->rst)
1044                                 sk->shutdown = SHUTDOWN_MASK;
1045                         break;
1046 
1047                 case TCP_CLOSE_WAIT:
1048                 case TCP_CLOSING:
1049                         /*
1050                          * received a retransmission of the FIN, do
1051                          * nothing.
1052                          */
1053                         break;
1054                 case TCP_TIME_WAIT:
1055                         /*
1056                          * received a retransmission of the FIN,
1057                          * restart the TIME_WAIT timer.
1058                          */
1059                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1060                         return(0);
1061                 case TCP_FIN_WAIT1:
1062                         /*
1063                          * This case occurs when a simultaneous close
1064                          * happens, we must ack the received FIN and
1065                          * enter the CLOSING state.
1066                          *
1067                          * This causes a WRITE timeout, which will either
1068                          * move on to TIME_WAIT when we timeout, or resend
1069                          * the FIN properly (maybe we get rid of that annoying
1070                          * FIN lost hang). The TIME_WRITE code is already correct
1071                          * for handling this timeout.
1072                          */
1073 
1074                         if(sk->ip_xmit_timeout != TIME_WRITE)
1075                                 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1076                         tcp_set_state(sk,TCP_CLOSING);
1077                         break;
1078                 case TCP_FIN_WAIT2:
1079                         /*
1080                          * received a FIN -- send ACK and enter TIME_WAIT
1081                          */
1082                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1083                         sk->shutdown|=SHUTDOWN_MASK;
1084                         tcp_set_state(sk,TCP_TIME_WAIT);
1085                         break;
1086                 case TCP_CLOSE:
1087                         /*
1088                          * already in CLOSE
1089                          */
1090                         break;
1091                 default:
1092                         tcp_set_state(sk,TCP_LAST_ACK);
1093         
1094                         /* Start the timers. */
1095                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1096                         return(0);
1097         }
1098 
1099         return(0);
1100 }
1101 
1102 
1103 
1104 /*
1105  *      This routine handles the data.  If there is room in the buffer,
1106  *      it will have already been moved into it.  If there is no
1107  *      room, then we will just have to discard the packet.
1108  */
1109 
1110 static int tcp_data(struct sk_buff *skb, struct sock *sk, 
     /* [previous][next][first][last][top][bottom][index][help] */
1111          unsigned long saddr, unsigned short len)
1112 {
1113         struct sk_buff *skb1, *skb2;
1114         struct tcphdr *th;
1115         int dup_dumped=0;
1116         u32 new_seq, shut_seq;
1117 
1118         th = skb->h.th;
1119         skb_pull(skb,th->doff*4);
1120         skb_trim(skb,len-(th->doff*4));
1121 
1122         /*
1123          *      The bytes in the receive read/assembly queue has increased. Needed for the
1124          *      low memory discard algorithm 
1125          */
1126            
1127         sk->bytes_rcv += skb->len;
1128         
1129         if (skb->len == 0 && !th->fin) 
1130         {
1131                 /* 
1132                  *      Don't want to keep passing ack's back and forth. 
1133                  *      (someone sent us dataless, boring frame)
1134                  */
1135                 if (!th->ack)
1136                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
1137                 kfree_skb(skb, FREE_READ);
1138                 return(0);
1139         }
1140         
1141         /*
1142          *      We no longer have anyone receiving data on this connection.
1143          */
1144 
1145 #ifndef TCP_DONT_RST_SHUTDOWN            
1146 
1147         if(sk->shutdown & RCV_SHUTDOWN)
1148         {
1149                 /*
1150                  *      FIXME: BSD has some magic to avoid sending resets to
1151                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
1152                  *      BSD stacks still have broken keepalives so we want to
1153                  *      cope with it.
1154                  */
1155 
1156                 if(skb->len)    /* We don't care if it's just an ack or
1157                                    a keepalive/window probe */
1158                 {
1159                         new_seq = skb->seq + skb->len + th->syn;        /* Right edge of _data_ part of frame */
1160                         
1161                         /* Do this the way 4.4BSD treats it. Not what I'd
1162                            regard as the meaning of the spec but it's what BSD
1163                            does and clearly they know everything 8) */
1164 
1165                         /*
1166                          *      This is valid because of two things
1167                          *
1168                          *      a) The way tcp_data behaves at the bottom.
1169                          *      b) A fin takes effect when read not when received.
1170                          */
1171                          
1172                         shut_seq = sk->acked_seq+1;     /* Last byte */
1173                         
1174                         if(after(new_seq,shut_seq))
1175                         {
1176                                 if(sk->debug)
1177                                         printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
1178                                                 sk, new_seq, shut_seq, sk->blog);
1179                                 if(sk->dead)
1180                                 {
1181                                         sk->acked_seq = new_seq + th->fin;
1182                                         tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
1183                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
1184                                         tcp_statistics.TcpEstabResets++;
1185                                         sk->err = EPIPE;
1186                                         sk->error_report(sk);
1187                                         sk->shutdown = SHUTDOWN_MASK;
1188                                         tcp_set_state(sk,TCP_CLOSE);
1189                                         kfree_skb(skb, FREE_READ);
1190                                         return 0;
1191                                 }
1192                         }
1193                 }
1194         }
1195 
1196 #endif
1197 
1198         /*
1199          *      Now we have to walk the chain, and figure out where this one
1200          *      goes into it.  This is set up so that the last packet we received
1201          *      will be the first one we look at, that way if everything comes
1202          *      in order, there will be no performance loss, and if they come
1203          *      out of order we will be able to fit things in nicely.
1204          *
1205          *      [AC: This is wrong. We should assume in order first and then walk
1206          *       forwards from the first hole based upon real traffic patterns.]
1207          *      
1208          */
1209 
1210         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
1211         {
1212                 skb_queue_head(&sk->receive_queue,skb);
1213                 skb1= NULL;
1214         } 
1215         else
1216         {
1217                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
1218                 {
1219                         if(sk->debug)
1220                         {
1221                                 printk("skb1=%p :", skb1);
1222                                 printk("skb1->seq = %d: ", skb1->seq);
1223                                 printk("skb->seq = %d\n",skb->seq);
1224                                 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
1225                                                 sk->acked_seq);
1226                         }
1227                         
1228                         /*
1229                          *      Optimisation: Duplicate frame or extension of previous frame from
1230                          *      same sequence point (lost ack case).
1231                          *      The frame contains duplicate data or replaces a previous frame
1232                          *      discard the previous frame (safe as sk->inuse is set) and put
1233                          *      the new one in its place.
1234                          */
1235                          
1236                         if (skb->seq==skb1->seq && skb->len>=skb1->len)
1237                         {
1238                                 skb_append(skb1,skb);
1239                                 skb_unlink(skb1);
1240                                 kfree_skb(skb1,FREE_READ);
1241                                 dup_dumped=1;
1242                                 skb1=NULL;
1243                                 break;
1244                         }
1245                         
1246                         /*
1247                          *      Found where it fits
1248                          */
1249                          
1250                         if (after(skb->seq+1, skb1->seq))
1251                         {
1252                                 skb_append(skb1,skb);
1253                                 break;
1254                         }
1255                         
1256                         /*
1257                          *      See if we've hit the start. If so insert.
1258                          */
1259                         if (skb1 == skb_peek(&sk->receive_queue))
1260                         {
1261                                 skb_queue_head(&sk->receive_queue, skb);
1262                                 break;
1263                         }
1264                 }
1265         }
1266 
1267         /*
1268          *      Figure out what the ack value for this frame is
1269          */
1270          
1271         if (before(sk->acked_seq, sk->copied_seq)) 
1272         {
1273                 printk("*** tcp.c:tcp_data bug acked < copied\n");
1274                 sk->acked_seq = sk->copied_seq;
1275         }
1276 
1277         /*
1278          *      Now figure out if we can ack anything. This is very messy because we really want two
1279          *      receive queues, a completed and an assembly queue. We also want only one transmit
1280          *      queue.
1281          */
1282 
1283         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1)) 
1284         {
1285                 if (before(skb->seq, sk->acked_seq+1)) 
1286                 {
1287 
1288                         if (after(skb->end_seq, sk->acked_seq)) 
1289                                 sk->acked_seq = skb->end_seq;
1290 
1291                         skb->acked = 1;
1292 
1293                         /*
1294                          *      When we ack the fin, we do the FIN 
1295                          *      processing.
1296                          */
1297 
1298                         if (skb->h.th->fin) 
1299                         {
1300                                 tcp_fin(skb,sk,skb->h.th);
1301                         }
1302           
1303                         for(skb2 = skb->next;
1304                             skb2 != (struct sk_buff *)&sk->receive_queue;
1305                             skb2 = skb2->next) 
1306                         {
1307                                 if (before(skb2->seq, sk->acked_seq+1)) 
1308                                 {
1309                                         if (after(skb2->end_seq, sk->acked_seq))
1310                                                 sk->acked_seq = skb2->end_seq;
1311 
1312                                         skb2->acked = 1;
1313                                         /*
1314                                          *      When we ack the fin, we do
1315                                          *      the fin handling.
1316                                          */
1317                                         if (skb2->h.th->fin) 
1318                                         {
1319                                                 tcp_fin(skb,sk,skb->h.th);
1320                                         }
1321 
1322                                         /*
1323                                          *      Force an immediate ack.
1324                                          */
1325                                          
1326                                         sk->ack_backlog = sk->max_ack_backlog;
1327                                 }
1328                                 else
1329                                 {
1330                                         break;
1331                                 }
1332                         }
1333 
1334                         /*
1335                          *      This also takes care of updating the window.
1336                          *      This if statement needs to be simplified.
1337                          *
1338                          *      rules for delaying an ack:
1339                          *      - delay time <= 0.5 HZ
1340                          *      - we don't have a window update to send
1341                          *      - must send at least every 2 full sized packets
1342                          */
1343                         if (!sk->delay_acks ||
1344                             sk->ack_backlog >= sk->max_ack_backlog || 
1345                             sk->bytes_rcv > sk->max_unacked || th->fin ||
1346                             sk->ato > HZ/2 ||
1347                             tcp_raise_window(sk)) {
1348         /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
1349                         }
1350                         else 
1351                         {
1352                                 sk->ack_backlog++;
1353                                 
1354                                 if(sk->debug)                           
1355                                         printk("Ack queued.\n");
1356                                 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->ato);
1357                         }
1358                 }
1359         }
1360 
1361         /*
1362          *      If we've missed a packet, send an ack.
1363          *      Also start a timer to send another.
1364          */
1365          
1366         if (!skb->acked) 
1367         {
1368         
1369         /*
1370          *      This is important.  If we don't have much room left,
1371          *      we need to throw out a few packets so we have a good
1372          *      window.  Note that mtu is used, not mss, because mss is really
1373          *      for the send side.  He could be sending us stuff as large as mtu.
1374          */
1375                  
1376                 while (sock_rspace(sk) < sk->mtu) 
1377                 {
1378                         skb1 = skb_peek(&sk->receive_queue);
1379                         if (skb1 == NULL) 
1380                         {
1381                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
1382                                 break;
1383                         }
1384 
1385                         /*
1386                          *      Don't throw out something that has been acked. 
1387                          */
1388                  
1389                         if (skb1->acked) 
1390                         {
1391                                 break;
1392                         }
1393                 
1394                         skb_unlink(skb1);
1395                         kfree_skb(skb1, FREE_READ);
1396                 }
1397                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
1398                 sk->ack_backlog++;
1399                 tcp_reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, 0.5 * HZ));
1400         }
1401         else
1402         {
1403                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
1404         }
1405 
1406         /*
1407          *      Now tell the user we may have some data. 
1408          */
1409          
1410         if (!sk->dead) 
1411         {
1412                 if(sk->debug)
1413                         printk("Data wakeup.\n");
1414                 sk->data_ready(sk,0);
1415         } 
1416         return(0);
1417 }
1418 
1419 
1420 /*
1421  *      This routine is only called when we have urgent data
1422  *      signalled. Its the 'slow' part of tcp_urg. It could be
1423  *      moved inline now as tcp_urg is only called from one
1424  *      place. We handle URGent data wrong. We have to - as
1425  *      BSD still doesn't use the correction from RFC961.
1426  */
1427  
1428 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /* [previous][next][first][last][top][bottom][index][help] */
1429 {
1430         u32 ptr = ntohs(th->urg_ptr);
1431 
1432         if (ptr)
1433                 ptr--;
1434         ptr += ntohl(th->seq);
1435 
1436         /* ignore urgent data that we've already seen and read */
1437         if (after(sk->copied_seq, ptr))
1438                 return;
1439 
1440         /* do we already have a newer (or duplicate) urgent pointer? */
1441         if (sk->urg_data && !after(ptr, sk->urg_seq))
1442                 return;
1443 
1444         /* tell the world about our new urgent pointer */
1445         if (sk->proc != 0) {
1446                 if (sk->proc > 0) {
1447                         kill_proc(sk->proc, SIGURG, 1);
1448                 } else {
1449                         kill_pg(-sk->proc, SIGURG, 1);
1450                 }
1451         }
1452         sk->urg_data = URG_NOTYET;
1453         sk->urg_seq = ptr;
1454 }
1455 
1456 /*
1457  *      This is the 'fast' part of urgent handling.
1458  */
1459  
1460 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
     /* [previous][next][first][last][top][bottom][index][help] */
1461 {
1462         /*
1463          *      Check if we get a new urgent pointer - normally not 
1464          */
1465          
1466         if (th->urg)
1467                 tcp_check_urg(sk,th);
1468 
1469         /*
1470          *      Do we wait for any urgent data? - normally not
1471          */
1472          
1473         if (sk->urg_data == URG_NOTYET) {
1474                 u32 ptr;
1475 
1476                 /*
1477                  *      Is the urgent pointer pointing into this packet? 
1478                  */      
1479                 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
1480                 if (ptr < len) {
1481                         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
1482                         if (!sk->dead)
1483                                 sk->data_ready(sk,0);
1484                 }
1485         }
1486 }
1487 
1488 
1489 /*
1490  *      A TCP packet has arrived.
1491  *              skb->h.raw is the TCP header.
1492  */
1493  
1494 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
     /* [previous][next][first][last][top][bottom][index][help] */
1495         __u32 daddr, unsigned short len,
1496         __u32 saddr, int redo, struct inet_protocol * protocol)
1497 {
1498         struct tcphdr *th;
1499         struct sock *sk;
1500         int syn_ok=0;
1501 
1502         /*
1503          * "redo" is 1 if we have already seen this skb but couldn't
1504          * use it at that time (the socket was locked).  In that case
1505          * we have already done a lot of the work (looked up the socket
1506          * etc).
1507          */
1508         th = skb->h.th;
1509         sk = skb->sk;
1510         if (!redo) {
1511                 tcp_statistics.TcpInSegs++;
1512                 if (skb->pkt_type!=PACKET_HOST)
1513                         goto discard_it;
1514 
1515                 /*
1516                  *      Pull up the IP header.
1517                  */
1518                 skb_pull(skb, skb->h.raw-skb->data);
1519 
1520                 /*
1521                  *      Try to use the device checksum if provided.
1522                  */
1523                 switch (skb->ip_summed) {
1524                         case CHECKSUM_NONE:
1525                                 skb->csum = csum_partial((char *)th, len, 0);
                                /* fall through: verify the freshly computed
                                   sum exactly like a hardware-provided one */
1526                         case CHECKSUM_HW:
1527                                 if (tcp_check(th, len, saddr, daddr, skb->csum))
1528                                         goto discard_it;
                                /* fall through */
1529                         default:
1530                                 /* CHECKSUM_UNNECESSARY */
1531                 }
1532                 sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
1533                 if (!sk)
1534                         goto no_tcp_socket;
1535                 skb->sk = sk;
                /*
                 *      Cache the sequence numbers in host order. end_seq
                 *      counts SYN and FIN as one sequence unit each, on top
                 *      of the payload length (len minus the TCP header).
                 */
1536                 skb->seq = ntohl(th->seq);
1537                 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
1538                 skb->ack_seq = ntohl(th->ack_seq);
1539 
1540                 skb->acked = 0;
1541                 skb->used = 0;
1542                 skb->free = 0;
1543                 skb->saddr = daddr;
1544                 skb->daddr = saddr;
1545         
1546                 /* We may need to add it to the backlog here. */
                /*
                 *      Interrupts are disabled so that testing sk->inuse and
                 *      either queueing the skb or claiming the socket is one
                 *      atomic step with respect to the softirq path.
                 */
1547                 cli();
1548                 if (sk->inuse) 
1549                 {
1550                         skb_queue_tail(&sk->back_log, skb);
1551                         sti();
1552                         return(0);
1553                 }
1554                 sk->inuse = 1;
1555                 sti();
1556         }
1557 
1558         /*
1559          *      If this socket has got a reset it's to all intents and purposes 
1560          *      really dead. Count closed sockets as dead.
1561          *
1562          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
1563          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
1564          *      exist so should cause resets as if the port was unreachable.
1565          */
1566 
1567         if (sk->zapped || sk->state==TCP_CLOSE)
1568                 goto no_tcp_socket;
1569 
1570         if (!sk->prot) 
1571         {
1572                 printk("IMPOSSIBLE 3\n");
                /* NOTE(review): this path returns without freeing the skb or
                   releasing the socket - looks like a leak; confirm. */
1573                 return(0);
1574         }
1575 
1576 
1577         /*
1578          *      Charge the memory to the socket. 
1579          */
1580          
1581         skb->sk=sk;
1582         sk->rmem_alloc += skb->truesize;
1583 
1584         /*
1585          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
1586          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
1587          *      compatibility. We also set up variables more thoroughly [Karn notes in the
1588          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
1589          */
1590 
1591         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
1592         {
1593         
1594                 /*
1595                  *      Now deal with unusual cases.
1596                  */
1597          
1598                 if(sk->state==TCP_LISTEN)
1599                 {
1600                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
1601                                 tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
1602 
1603                         /*
1604                          *      We don't care for RST, and non SYN are absorbed (old segments)
1605                          *      Broadcast/multicast SYN isn't allowed. Note - bug if you change the
1606                          *      netmask on a running connection it can go broadcast. Even Sun's have
1607                          *      this problem so I'm ignoring it 
1608                          */
1609                            
1610                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
1611                         {
1612                                 kfree_skb(skb, FREE_READ);
1613                                 release_sock(sk);
1614                                 return 0;
1615                         }
1616                 
1617                         /*      
1618                          *      Guess we need to make a new socket up 
1619                          */
1620                 
1621                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
1622                 
1623                         /*
1624                          *      Now we have several options: In theory there is nothing else
1625                          *      in the frame. KA9Q has an option to send data with the syn,
1626                          *      BSD accepts data with the syn up to the [to be] advertised window
1627                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
1628                          *      it, that fits the spec precisely and avoids incompatibilities. It
1629                          *      would be nice in future to drop through and process the data.
1630                          */
1631                          
1632                         release_sock(sk);
1633                         return 0;
1634                 }
1635         
1636                 /* retransmitted SYN? */
1637                 if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
1638                 {
1639                         kfree_skb(skb, FREE_READ);
1640                         release_sock(sk);
1641                         return 0;
1642                 }
1643                 
1644                 /*
1645                  *      SYN sent means we have to look for a suitable ack and either reset
1646                  *      for bad matches or go to connected 
1647                  */
1648            
1649                 if(sk->state==TCP_SYN_SENT)
1650                 {
1651                         /* Crossed SYN or previous junk segment */
1652                         if(th->ack)
1653                         {
1654                                 /* We got an ack, but it's not a good ack */
1655                                 if(!tcp_ack(sk,th,skb->ack_seq,len))
1656                                 {
1657                                         /* Reset the ack - its an ack from a 
1658                                            different connection  [ th->rst is checked in tcp_send_reset()] */
1659                                         tcp_statistics.TcpAttemptFails++;
1660                                         tcp_send_reset(daddr, saddr, th,
1661                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1662                                         kfree_skb(skb, FREE_READ);
1663                                         release_sock(sk);
1664                                         return(0);
1665                                 }
                                /* NOTE(review): tcp_reset() is assumed to free
                                   the skb and release the socket - confirm. */
1666                                 if(th->rst)
1667                                         return tcp_reset(sk,skb);
1668                                 if(!th->syn)
1669                                 {
1670                                         /* A valid ack from a different connection
1671                                            start. Shouldn't happen but cover it */
1672                                         tcp_statistics.TcpAttemptFails++;
1673                                         tcp_send_reset(daddr, saddr, th,
1674                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1675                                         kfree_skb(skb, FREE_READ);
1676                                         release_sock(sk);
1677                                         return 0;
1678                                 }
1679                                 /*
1680                                  *      Ok.. it's good. Set up sequence numbers and
1681                                  *      move to established.
1682                                  */
1683                                 syn_ok=1;       /* Don't reset this connection for the syn */
1684                                 sk->acked_seq = skb->seq+1;
1685                                 sk->lastwin_seq = skb->seq+1;
1686                                 sk->fin_seq = skb->seq;
1687                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
1688                                 tcp_set_state(sk, TCP_ESTABLISHED);
1689                                 tcp_options(sk,th);
1690                                 sk->dummy_th.dest=th->source;
1691                                 sk->copied_seq = sk->acked_seq;
1692                                 if(!sk->dead)
1693                                 {
1694                                         sk->state_change(sk);
1695                                         sock_wake_async(sk->socket, 0);
1696                                 }
                                /*
                                 *      Peer advertised no window at all: fall
                                 *      back to a tiny 32-byte window so we can
                                 *      still make progress.
                                 */
1697                                 if(sk->max_window==0)
1698                                 {
1699                                         sk->max_window = 32;
1700                                         sk->mss = min(sk->max_window, sk->mtu);
1701                                 }
1702                         }
1703                         else
1704                         {
1705                                 /* See if SYN's cross. Drop if boring */
1706                                 if(th->syn && !th->rst)
1707                                 {
1708                                         /* Crossed SYN's are fine - but talking to
1709                                            yourself is right out... */
1710                                         if(sk->saddr==saddr && sk->daddr==daddr &&
1711                                                 sk->dummy_th.source==th->source &&
1712                                                 sk->dummy_th.dest==th->dest)
1713                                         {
1714                                                 tcp_statistics.TcpAttemptFails++;
1715                                                 return tcp_reset(sk,skb);
1716                                         }
1717                                         tcp_set_state(sk,TCP_SYN_RECV);
1718                                         
1719                                         /*
1720                                          *      FIXME:
1721                                          *      Must send SYN|ACK here
1722                                          */
1723                                 }               
1724                                 /* Discard junk segment */
1725                                 kfree_skb(skb, FREE_READ);
1726                                 release_sock(sk);
1727                                 return 0;
1728                         }
1729                         /*
1730                          *      SYN_RECV with data maybe.. drop through
1731                          */
1732                         goto rfc_step6;
1733                 }
1734 
1735         /*
1736          *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
1737          *      a more complex suggestion for fixing these reuse issues in RFC1644
1738          *      but not yet ready for general use. Also see RFC1379.
1739          */
1740         
1741 #define BSD_TIME_WAIT
1742 #ifdef BSD_TIME_WAIT
1743                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
1744                         after(skb->seq, sk->acked_seq) && !th->rst)
1745                 {
1746                         u32 seq = sk->write_seq;
1747                         if(sk->debug)
1748                                 printk("Doing a BSD time wait\n");
1749                         tcp_statistics.TcpEstabResets++;           
1750                         sk->rmem_alloc -= skb->truesize;
1751                         skb->sk = NULL;
1752                         sk->err=ECONNRESET;
1753                         tcp_set_state(sk, TCP_CLOSE);
1754                         sk->shutdown = SHUTDOWN_MASK;
1755                         release_sock(sk);
                        /* Hand the SYN to any listener on the same port pair */
1756                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
1757                         if (sk && sk->state==TCP_LISTEN)
1758                         {
1759                                 sk->inuse=1;
1760                                 skb->sk = sk;
1761                                 sk->rmem_alloc += skb->truesize;
                                /* New ISN is advanced past the old connection's
                                   write_seq so old segments can't be confused
                                   with the new incarnation. */
1762                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
1763                                 release_sock(sk);
1764                                 return 0;
1765                         }
1766                         kfree_skb(skb, FREE_READ);
1767                         return 0;
1768                 }
1769 #endif  
1770         }
1771 
1772         /*
1773          *      We are now in normal data flow (see the step list in the RFC)
1774          *      Note most of these are inline now. I'll inline the lot when
1775          *      I have time to test it hard and look at what gcc outputs 
1776          */
1777         
1778         if (!tcp_sequence(sk, skb->seq, skb->end_seq))
1779         {
1780                 bad_tcp_sequence(sk, th, len, opt, saddr, dev);
1781                 kfree_skb(skb, FREE_READ);
1782                 release_sock(sk);
1783                 return 0;
1784         }
1785 
        /* NOTE(review): tcp_reset() is assumed to free the skb and release
           the socket on this path - confirm. */
1786         if(th->rst)
1787                 return tcp_reset(sk,skb);
1788         
1789         /*
1790          *      !syn_ok is effectively the state test in RFC793.
1791          */
1792          
1793         if(th->syn && !syn_ok)
1794         {
1795                 tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
1796                 return tcp_reset(sk,skb);       
1797         }
1798 
1799 
1800         /*
1801          *      Delayed ACK time estimator.
1802          */
        /*
         *      sk->ato estimates the inter-arrival time of segments; it is
         *      used (see tcp_data above) to bound the delayed-ACK timer.
         */
1803         
1804         if (sk->lrcvtime == 0) 
1805         {
1806                 sk->lrcvtime = jiffies;
1807                 sk->ato = HZ/3;
1808         }
1809         else 
1810         {
1811                 int m;
1812                 
1813                 m = jiffies - sk->lrcvtime;
1814 
1815                 sk->lrcvtime = jiffies;
1816 
1817                 if (m <= 0)
1818                         m = 1;
1819 
1820                 if (m > (sk->rtt >> 3)) 
1821                 {
1822                         sk->ato = sk->rtt >> 3;
1823                         /*
1824                          * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
1825                          */
1826                 }
1827                 else 
1828                 {
                        /* Exponentially-smoothed update towards the new sample */
1829                         sk->ato = (sk->ato >> 1) + m;
1830                         /*
1831                          * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
1832                          */
1833                 }
1834         }
1835           
1836         /*
1837          *      Process the ACK
1838          */
1839          
1840 
1841         if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
1842         {
1843                 /*
1844                  *      Our three way handshake failed.
1845                  */
1846                  
1847                 if(sk->state==TCP_SYN_RECV)
1848                 {
1849                         tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
1850                 }
1851                 kfree_skb(skb, FREE_READ);
1852                 release_sock(sk);
1853                 return 0;
1854         }
1855         
1856 rfc_step6:              /* I'll clean this up later */
1857 
1858         /*
1859          *      If the accepted buffer put us over our queue size we
1860          *      now drop it (we must process the ack first to avoid
1861          *      deadlock cases).
1862          */
1863          
1864         if (sk->rmem_alloc  >= sk->rcvbuf) 
1865         {
1866                 kfree_skb(skb, FREE_READ);
1867                 release_sock(sk);
1868                 return(0);
1869         }
1870 
1871 
1872         /*
1873          *      Process urgent data
1874          */
1875                 
1876         tcp_urg(sk, th, len);
1877         
1878         /*
1879          *      Process the encapsulated data
1880          */
1881         
        /* Non-zero return means tcp_data() did not consume the skb */
1882         if(tcp_data(skb,sk, saddr, len))
1883         {
1884                 kfree_skb(skb, FREE_READ);
1885                 release_sock(sk);
1886                 return 0;
1887         }
1888 
1889         /*
1890          *      And done
1891          */     
1892         
1893         release_sock(sk);
1894         return 0;
1895 
1896 no_tcp_socket:
1897         /*
1898          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
1899          */
1900         tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
1901 
1902 discard_it:
1903         /*
1904          *      Discard frame
1905          */
1906         skb->sk = NULL;
1907         kfree_skb(skb, FREE_READ);
1908         return 0;
1909 }

/* [previous][next][first][last][top][bottom][index][help] */