1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp_input.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * FIXES 23 * Pedro Roque : Double ACK bug 24 */ 25
26 #include <linux/config.h>
27 #include <net/tcp.h>
28
29 #include <linux/interrupt.h>
30
/*
 *	Policy code extracted so it's now separate
 */

35 /* 36 * Called each time to estimate the delayed ack timeout. This is 37 * how it should be done so a fast link isnt impacted by ack delay. 38 */ 39
40 extern__inline__voidtcp_delack_estimator(structsock *sk)
/* */ 41 { 42 /* 43 * Delayed ACK time estimator. 44 */ 45
46 if (sk->lrcvtime == 0)
47 { 48 sk->lrcvtime = jiffies;
49 sk->ato = HZ/3;
50 } 51 else 52 { 53 intm;
54
55 m = jiffies - sk->lrcvtime;
56
57 sk->lrcvtime = jiffies;
58
59 if (m <= 0)
60 m = 1;
61
62 if (m > (sk->rtt >> 3))
63 { 64 sk->ato = sk->rtt >> 3;
65 /* 66 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato); 67 */ 68 } 69 else 70 { 71 sk->ato = (sk->ato >> 1) + m;
72 /* 73 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato); 74 */ 75 } 76 } 77 } 78
79 /* 80 * Called on frames that were known _not_ to have been 81 * retransmitted [see Karn/Partridge Proceedings SIGCOMM 87]. 82 * The algorithm is from the SIGCOMM 88 piece by Van Jacobson. 83 */ 84
85 extern__inline__voidtcp_rtt_estimator(structsock *sk, structsk_buff *oskb)
/* */ 86 { 87 longm;
88 /* 89 * The following amusing code comes from Jacobson's 90 * article in SIGCOMM '88. Note that rtt and mdev 91 * are scaled versions of rtt and mean deviation. 92 * This is designed to be as fast as possible 93 * m stands for "measurement". 94 */ 95
96 m = jiffies - oskb->when; /* RTT */ 97 if(m<=0)
98 m=1; /* IS THIS RIGHT FOR <0 ??? */ 99 m -= (sk->rtt >> 3); /* m is now error in rtt est */ 100 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */ 101 if (m < 0)
102 m = -m; /* m is now abs(error) */ 103 m -= (sk->mdev >> 2); /* similar update on mdev */ 104 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ 105
106 /* 107 * Now update timeout. Note that this removes any backoff. 108 */ 109
110 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
111 if (sk->rto > 120*HZ)
112 sk->rto = 120*HZ;
113 if (sk->rto < HZ/5) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */ 114 sk->rto = HZ/5;
115 sk->backoff = 0;
116 } 117
/*
 *	One-entry lookup cache: the socket most recently returned by
 *	get_sock() in get_tcp_sock(), and the address/port values it
 *	was looked up with.
 */

static volatile unsigned long th_cache_saddr;
static volatile unsigned long th_cache_daddr;
static volatile unsigned short th_cache_dport;
static volatile unsigned short th_cache_sport;
static volatile struct sock *th_cache_sk;
125
126 voidtcp_cache_zap(void)
/* */ 127 { 128 th_cache_sk=NULL;
129 } 130
131 /* 132 * Find the socket, using the last hit cache if applicable. The cache is not quite 133 * right... 134 */ 135
136 staticinlinestructsock * get_tcp_sock(u32saddr, u16sport, u32daddr, u16dport)
/* */ 137 { 138 structsock * sk;
139
140 sk = (structsock *) th_cache_sk;
141 if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
142 sport != th_cache_sport || dport != th_cache_dport) { 143 sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
144 if (sk) { 145 th_cache_saddr=saddr;
146 th_cache_daddr=daddr;
147 th_cache_dport=dport;
148 th_cache_sport=sport;
149 th_cache_sk=sk;
150 } 151 } 152 returnsk;
153 } 154
155 /* 156 * React to a out-of-window TCP sequence number in an incoming packet 157 */ 158
159 staticvoidbad_tcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */ 160 structoptions *opt, unsignedlongsaddr, structdevice *dev)
161 { 162 if (th->rst)
163 return;
164
165 /* 166 * Send a reset if we get something not ours and we are 167 * unsynchronized. Note: We don't do anything to our end. We 168 * are just killing the bogus remote connection then we will 169 * connect again and it will work (with luck). 170 */ 171
172 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
173 { 174 tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
175 return;
176 } 177
178 /* 179 * 4.3reno machines look for these kind of acks so they can do fast 180 * recovery. Three identical 'old' acks lets it know that one frame has 181 * been lost and should be resent. Because this is before the whole window 182 * of data has timed out it can take one lost frame per window without 183 * stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2] 184 * 185 * We also should be spotting triple bad sequences. 186 */ 187 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
188 return;
189 } 190
191 /* 192 * This functions checks to see if the tcp header is actually acceptable. 193 */ 194
195 extern__inline__inttcp_sequence(structsock *sk, u32seq, u32end_seq)
/* */ 196 { 197 u32end_window = sk->acked_seq + sk->window;
198 return/* if start is at end of window, end must be too (zero window) */ 199 (seq == end_window && seq == end_seq) ||
200 /* if start is before end of window, check for interest */ 201 (before(seq, end_window) && !before(end_seq, sk->acked_seq));
202 } 203
/*
 *	Handle a received reset: mark the socket zapped, record the error
 *	BSD would report for the current state, close the connection,
 *	notify the owner (unless the socket is dead) and free the skb.
 *	Always returns 0.
 */

static int tcp_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Pick the error code according to where the connection was. */
	switch (sk->state) {
	case TCP_SYN_SENT:
		sk->err = ECONNREFUSED;
		break;
	case TCP_CLOSE_WAIT:
		sk->err = EPIPE;
		break;
	default:
		sk->err = ECONNRESET;
		break;
	}

	/*
	 *	With CONFIG_TCP_RFC1337 (time wait assassination protection)
	 *	a socket in TIME_WAIT is left alone; otherwise the close is
	 *	unconditional.  The original notes say this protection makes
	 *	more sockets take time to close and was later shown to be an
	 *	inadequate fix for the protocol bug.
	 */
#ifdef CONFIG_TCP_RFC1337
	if (sk->state != TCP_TIME_WAIT)
#endif
	{
		tcp_set_state(sk, TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}

	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	return 0;
}

244
245 /* 246 * Look for tcp options. Parses everything but only knows about MSS. 247 * This routine is always called with the packet containing the SYN. 248 * However it may also be called with the ack to the SYN. So you 249 * can't assume this is always the SYN. It's always called after 250 * we have set up sk->mtu to our own MTU. 251 * 252 * We need at minimum to add PAWS support here. Possibly large windows 253 * as Linux gets deployed on 100Mb/sec networks. 254 */ 255
256 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */ 257 { 258 unsignedchar *ptr;
259 intlength=(th->doff*4)-sizeof(structtcphdr);
260 intmss_seen = 0;
261
262 ptr = (unsignedchar *)(th + 1);
263
264 while(length>0)
265 { 266 intopcode=*ptr++;
267 intopsize=*ptr++;
268 switch(opcode)
269 { 270 caseTCPOPT_EOL:
271 return;
272 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ 273 length--;
274 ptr--; /* the opsize=*ptr++ above was a mistake */ 275 continue;
276
277 default:
278 if(opsize<=2) /* Avoid silly options looping forever */ 279 return;
280 switch(opcode)
281 { 282 caseTCPOPT_MSS:
283 if(opsize==4 && th->syn)
284 { 285 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
286 mss_seen = 1;
287 } 288 break;
289 /* Add other options here as people feel the urge to implement stuff like large windows */ 290 } 291 ptr+=opsize-2;
292 length-=opsize;
293 } 294 } 295 if (th->syn)
296 { 297 if (! mss_seen)
298 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */ 299 } 300 #ifdefCONFIG_INET_PCTCP 301 sk->mss = min(sk->max_window >> 1, sk->mtu);
302 #else 303 sk->mss = min(sk->max_window, sk->mtu);
304 sk->max_unacked = 2 * sk->mss;
305 #endif 306 } 307
308
309 /* 310 * This routine handles a connection request. 311 * It should make sure we haven't already responded. 312 * Because of the way BSD works, we have to send a syn/ack now. 313 * This also means it will be harder to close a socket which is 314 * listening. 315 */ 316
317 staticvoidtcp_conn_request(structsock *sk, structsk_buff *skb,
/* */ 318 u32daddr, u32saddr, structoptions *opt, structdevice *dev, u32seq)
319 { 320 structsock *newsk;
321 structtcphdr *th;
322 structrtable *rt;
323
324 th = skb->h.th;
325
326 /* If the socket is dead, don't accept the connection. */ 327 if (!sk->dead)
328 { 329 sk->data_ready(sk,0);
330 } 331 else 332 { 333 if(sk->debug)
334 printk("Reset on %p: Connect on dead socket.\n",sk);
335 tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
336 tcp_statistics.TcpAttemptFails++;
337 kfree_skb(skb, FREE_READ);
338 return;
339 } 340
341 /* 342 * Make sure we can accept more. This will prevent a 343 * flurry of syns from eating up all our memory. 344 * 345 * BSD does some funnies here and allows 3/2 times the 346 * set backlog as a fudge factor. Thats just too gross. 347 */ 348
349 if (sk->ack_backlog >= sk->max_ack_backlog)
350 { 351 tcp_statistics.TcpAttemptFails++;
352 kfree_skb(skb, FREE_READ);
353 return;
354 } 355
356 /* 357 * We need to build a new sock struct. 358 * It is sort of bad to have a socket without an inode attached 359 * to it, but the wake_up's will just wake up the listening socket, 360 * and if the listening socket is destroyed before this is taken 361 * off of the queue, this will take care of it. 362 */ 363
364 newsk = (structsock *) kmalloc(sizeof(structsock), GFP_ATOMIC);
365 if (newsk == NULL)
366 { 367 /* just ignore the syn. It will get retransmitted. */ 368 tcp_statistics.TcpAttemptFails++;
369 kfree_skb(skb, FREE_READ);
370 return;
371 } 372
373 memcpy(newsk, sk, sizeof(*newsk));
374 newsk->opt = NULL;
375 newsk->ip_route_cache = NULL;
376 if (opt && opt->optlen)
377 { 378 sk->opt = (structoptions*)kmalloc(sizeof(structoptions)+opt->optlen, GFP_ATOMIC);
379 if (!sk->opt)
380 { 381 kfree_s(newsk, sizeof(structsock));
382 tcp_statistics.TcpAttemptFails++;
383 kfree_skb(skb, FREE_READ);
384 return;
385 } 386 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb))
387 { 388 kfree_s(sk->opt, sizeof(structoptions)+opt->optlen);
389 kfree_s(newsk, sizeof(structsock));
390 tcp_statistics.TcpAttemptFails++;
391 kfree_skb(skb, FREE_READ);
392 return;
393 } 394 } 395 skb_queue_head_init(&newsk->write_queue);
396 skb_queue_head_init(&newsk->receive_queue);
397 newsk->send_head = NULL;
398 newsk->send_tail = NULL;
399 skb_queue_head_init(&newsk->back_log);
400 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/ 401 newsk->rto = TCP_TIMEOUT_INIT;
402 newsk->mdev = 0;
403 newsk->max_window = 0;
404 newsk->cong_window = 1;
405 newsk->cong_count = 0;
406 newsk->ssthresh = 0;
407 newsk->backoff = 0;
408 newsk->blog = 0;
409 newsk->intr = 0;
410 newsk->proc = 0;
411 newsk->done = 0;
412 newsk->partial = NULL;
413 newsk->pair = NULL;
414 newsk->wmem_alloc = 0;
415 newsk->rmem_alloc = 0;
416 newsk->localroute = sk->localroute;
417
418 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
419
420 newsk->err = 0;
421 newsk->shutdown = 0;
422 newsk->ack_backlog = 0;
423 newsk->acked_seq = skb->seq+1;
424 newsk->lastwin_seq = skb->seq+1;
425 newsk->delay_acks = 1;
426 newsk->copied_seq = skb->seq+1;
427 newsk->fin_seq = skb->seq;
428 newsk->state = TCP_SYN_RECV;
429 newsk->timeout = 0;
430 newsk->ip_xmit_timeout = 0;
431 newsk->write_seq = seq;
432 newsk->window_seq = newsk->write_seq;
433 newsk->rcv_ack_seq = newsk->write_seq;
434 newsk->urg_data = 0;
435 newsk->retransmits = 0;
436 newsk->linger=0;
437 newsk->destroy = 0;
438 init_timer(&newsk->timer);
439 newsk->timer.data = (unsignedlong)newsk;
440 newsk->timer.function = &net_timer;
441 init_timer(&newsk->retransmit_timer);
442 newsk->retransmit_timer.data = (unsignedlong)newsk;
443 newsk->retransmit_timer.function=&tcp_retransmit_timer;
444 newsk->dummy_th.source = skb->h.th->dest;
445 newsk->dummy_th.dest = skb->h.th->source;
446
447 /* 448 * Swap these two, they are from our point of view. 449 */ 450
451 newsk->daddr = saddr;
452 newsk->saddr = daddr;
453 newsk->rcv_saddr = daddr;
454
455 put_sock(newsk->num,newsk);
456 newsk->acked_seq = skb->seq + 1;
457 newsk->copied_seq = skb->seq + 1;
458 newsk->socket = NULL;
459
460 /* 461 * Grab the ttl and tos values and use them 462 */ 463
464 newsk->ip_ttl=sk->ip_ttl;
465 newsk->ip_tos=skb->ip_hdr->tos;
466
467 /* 468 * Use 512 or whatever user asked for 469 */ 470
471 /* 472 * Note use of sk->user_mss, since user has no direct access to newsk 473 */ 474
475 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
476 newsk->ip_route_cache = rt;
477
478 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
479 newsk->window_clamp = rt->rt_window;
480 else 481 newsk->window_clamp = 0;
482
483 if (sk->user_mss)
484 newsk->mtu = sk->user_mss;
485 elseif (rt)
486 newsk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
487 else 488 newsk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
489
490 /* 491 * But not bigger than device MTU 492 */ 493
494 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
495
496 #ifdefCONFIG_SKIP 497
498 /* 499 * SKIP devices set their MTU to 65535. This is so they can take packets 500 * unfragmented to security process then fragment. They could lie to the 501 * TCP layer about a suitable MTU, but its easier to let skip sort it out 502 * simply because the final package we want unfragmented is going to be 503 * 504 * [IPHDR][IPSP][Security data][Modified TCP data][Security data] 505 */ 506
507 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */ 508 sk->mtu=skip_pick_mtu(sk->mtu,dev);
509 #endif 510 /* 511 * This will min with what arrived in the packet 512 */ 513
514 tcp_options(newsk,skb->h.th);
515
516 tcp_cache_zap();
517 tcp_send_synack(newsk, sk, skb);
518 } 519
520
521 /* 522 * Handle a TCP window that shrunk on us. It shouldn't happen, 523 * but.. 524 * 525 * We may need to move packets from the send queue 526 * to the write queue, if the window has been shrunk on us. 527 * The RFC says you are not allowed to shrink your window 528 * like this, but if the other end does, you must be able 529 * to deal with it. 530 */ 531 voidtcp_window_shrunk(structsock * sk, u32window_seq)
/* */ 532 { 533 structsk_buff *skb;
534 structsk_buff *skb2;
535 structsk_buff *wskb = NULL;
536
537 skb2 = sk->send_head;
538 sk->send_head = NULL;
539 sk->send_tail = NULL;
540
541 /* 542 * This is an artifact of a flawed concept. We want one 543 * queue and a smarter send routine when we send all. 544 */ 545 cli();
546 while (skb2 != NULL)
547 { 548 skb = skb2;
549 skb2 = skb->link3;
550 skb->link3 = NULL;
551 if (after(skb->end_seq, window_seq))
552 { 553 if (sk->packets_out > 0)
554 sk->packets_out--;
555 /* We may need to remove this from the dev send list. */ 556 if (skb->next != NULL)
557 { 558 skb_unlink(skb);
559 } 560 /* Now add it to the write_queue. */ 561 if (wskb == NULL)
562 skb_queue_head(&sk->write_queue,skb);
563 else 564 skb_append(wskb,skb);
565 wskb = skb;
566 } 567 else 568 { 569 if (sk->send_head == NULL)
570 { 571 sk->send_head = skb;
572 sk->send_tail = skb;
573 } 574 else 575 { 576 sk->send_tail->link3 = skb;
577 sk->send_tail = skb;
578 } 579 skb->link3 = NULL;
580 } 581 } 582 sti();
583 } 584
585
586 /* 587 * This routine deals with incoming acks, but not outgoing ones. 588 * 589 * This routine is totally _WRONG_. The list structuring is wrong, 590 * the algorithm is wrong, the code is wrong. 591 */ 592
593 staticinttcp_ack(structsock *sk, structtcphdr *th, u32ack, intlen)
/* */ 594 { 595 intflag = 0;
596 u32window_seq;
597
598 /* 599 * 1 - there was data in packet as well as ack or new data is sent or 600 * in shutdown state 601 * 2 - data from retransmit queue was acked and removed 602 * 4 - window shrunk or data from retransmit queue was acked and removed 603 */ 604
605 if(sk->zapped)
606 return(1); /* Dead, cant ack any more so why bother */ 607
608 /* 609 * We have dropped back to keepalive timeouts. Thus we have 610 * no retransmits pending. 611 */ 612
613 if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
614 sk->retransmits = 0;
615
616 /* 617 * If the ack is newer than sent or older than previous acks 618 * then we can probably ignore it. 619 */ 620
621 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
622 gotouninteresting_ack;
623
624 /* 625 * If there is data set flag 1 626 */ 627
628 if (len != th->doff*4)
629 flag |= 1;
630
631 /* 632 * Have we discovered a larger window 633 */ 634 window_seq = ntohs(th->window);
635 if (window_seq > sk->max_window)
636 { 637 sk->max_window = window_seq;
638 #ifdefCONFIG_INET_PCTCP 639 /* Hack because we don't send partial packets to non SWS 640 handling hosts */ 641 sk->mss = min(window_seq>>1, sk->mtu);
642 #else 643 sk->mss = min(window_seq, sk->mtu);
644 #endif 645 } 646 window_seq += ack;
647
648 /* 649 * See if our window has been shrunk. 650 */ 651 if (after(sk->window_seq, window_seq)) { 652 flag |= 4;
653 tcp_window_shrunk(sk, window_seq);
654 } 655
656 /* 657 * Update the right hand window edge of the host 658 */ 659 sk->window_seq = window_seq;
660
661 /* 662 * Pipe has emptied 663 */ 664 if (sk->send_tail == NULL || sk->send_head == NULL)
665 { 666 sk->send_head = NULL;
667 sk->send_tail = NULL;
668 sk->packets_out= 0;
669 } 670
671 /* 672 * We don't want too many packets out there. 673 */ 674
675 if (sk->ip_xmit_timeout == TIME_WRITE &&
676 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
677 { 678
679 /* 680 * This is Jacobson's slow start and congestion avoidance. 681 * SIGCOMM '88, p. 328. Because we keep cong_window in integral 682 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a 683 * counter and increment it once every cwnd times. It's possible 684 * that this should be done only if sk->retransmits == 0. I'm 685 * interpreting "new data is acked" as including data that has 686 * been retransmitted but is just now being acked. 687 */ 688 if (sk->cong_window < sk->ssthresh)
689 /* 690 * In "safe" area, increase 691 */ 692 sk->cong_window++;
693 else 694 { 695 /* 696 * In dangerous area, increase slowly. In theory this is 697 * sk->cong_window += 1 / sk->cong_window 698 */ 699 if (sk->cong_count >= sk->cong_window)
700 { 701 sk->cong_window++;
702 sk->cong_count = 0;
703 } 704 else 705 sk->cong_count++;
706 } 707 } 708
709 /* 710 * Remember the highest ack received. 711 */ 712
713 sk->rcv_ack_seq = ack;
714
715 /* 716 * We passed data and got it acked, remove any soft error 717 * log. Something worked... 718 */ 719
720 sk->err_soft = 0;
721
722 /* 723 * If this ack opens up a zero window, clear backoff. It was 724 * being used to time the probes, and is probably far higher than 725 * it needs to be for normal retransmission. 726 */ 727
728 if (sk->ip_xmit_timeout == TIME_PROBE0)
729 { 730 sk->retransmits = 0; /* Our probe was answered */ 731
732 /* 733 * Was it a usable window open ? 734 */ 735
736 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */ 737 ! before (sk->window_seq, sk->write_queue.next->end_seq))
738 { 739 sk->backoff = 0;
740
741 /* 742 * Recompute rto from rtt. this eliminates any backoff. 743 */ 744
745 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
746 if (sk->rto > 120*HZ)
747 sk->rto = 120*HZ;
748 if (sk->rto < HZ/5) /* Was 1*HZ, then 1 - turns out we must allow about 749 .2 of a second because of BSD delayed acks - on a 100Mb/sec link 750 .2 of a second is going to need huge windows (SIGH) */ 751 sk->rto = HZ/5;
752 } 753 } 754
755 /* 756 * See if we can take anything off of the retransmit queue. 757 */ 758
759 while(sk->send_head != NULL)
760 { 761 /* Check for a bug. */ 762 if (sk->send_head->link3 &&
763 after(sk->send_head->end_seq, sk->send_head->link3->end_seq))
764 printk("INET: tcp.c: *** bug send_list out of order.\n");
765
766 /* 767 * If our packet is before the ack sequence we can 768 * discard it as it's confirmed to have arrived the other end. 769 */ 770
771 if (before(sk->send_head->end_seq, ack+1))
772 { 773 structsk_buff *oskb;
774 if (sk->retransmits)
775 { 776 /* 777 * We were retransmitting. don't count this in RTT est 778 */ 779 flag |= 2;
780
781 /* 782 * even though we've gotten an ack, we're still 783 * retransmitting as long as we're sending from 784 * the retransmit queue. Keeping retransmits non-zero 785 * prevents us from getting new data interspersed with 786 * retransmissions. 787 */ 788
789 if (sk->send_head->link3) /* Any more queued retransmits? */ 790 sk->retransmits = 1;
791 else 792 sk->retransmits = 0;
793 } 794 /* 795 * Note that we only reset backoff and rto in the 796 * rtt recomputation code. And that doesn't happen 797 * if there were retransmissions in effect. So the 798 * first new packet after the retransmissions is 799 * sent with the backoff still in effect. Not until 800 * we get an ack from a non-retransmitted packet do 801 * we reset the backoff and rto. This allows us to deal 802 * with a situation where the network delay has increased 803 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.) 804 */ 805
806 /* 807 * We have one less packet out there. 808 */ 809
810 if (sk->packets_out > 0)
811 sk->packets_out --;
812
813 oskb = sk->send_head;
814
815 if (!(flag&2)) /* Not retransmitting */ 816 tcp_rtt_estimator(sk,oskb);
817 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt 818 In this case as we just set it up */ 819 cli();
820 oskb = sk->send_head;
821 IS_SKB(oskb);
822 sk->send_head = oskb->link3;
823 if (sk->send_head == NULL)
824 { 825 sk->send_tail = NULL;
826 } 827
828 /* 829 * We may need to remove this from the dev send list. 830 */ 831
832 if (oskb->next)
833 skb_unlink(oskb);
834 sti();
835 kfree_skb(oskb, FREE_WRITE); /* write. */ 836 if (!sk->dead)
837 sk->write_space(sk);
838 } 839 else 840 { 841 break;
842 } 843 } 844
845 /* 846 * XXX someone ought to look at this too.. at the moment, if skb_peek() 847 * returns non-NULL, we complete ignore the timer stuff in the else 848 * clause. We ought to organize the code so that else clause can 849 * (should) be executed regardless, possibly moving the PROBE timer 850 * reset over. The skb_peek() thing should only move stuff to the 851 * write queue, NOT also manage the timer functions. 852 */ 853
854 /* 855 * Maybe we can take some stuff off of the write queue, 856 * and put it onto the xmit queue. 857 */ 858 if (skb_peek(&sk->write_queue) != NULL)
859 { 860 if (!before(sk->window_seq, sk->write_queue.next->end_seq) &&
861 (sk->retransmits == 0 ||
862 sk->ip_xmit_timeout != TIME_WRITE ||
863 !after(sk->write_queue.next->end_seq, sk->rcv_ack_seq))
864 && sk->packets_out < sk->cong_window)
865 { 866 /* 867 * Add more data to the send queue. 868 */ 869 flag |= 1;
870 tcp_write_xmit(sk);
871 } 872 elseif (before(sk->window_seq, sk->write_queue.next->end_seq) &&
873 sk->send_head == NULL &&
874 sk->ack_backlog == 0 &&
875 sk->state != TCP_TIME_WAIT)
876 { 877 /* 878 * Data to queue but no room. 879 */ 880 tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
881 } 882 } 883 else 884 { 885 /* 886 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets 887 * from TCP_CLOSE we don't do anything 888 * 889 * from anything else, if there is write data (or fin) pending, 890 * we use a TIME_WRITE timeout, else if keepalive we reset to 891 * a KEEPALIVE timeout, else we delete the timer. 892 * 893 * We do not set flag for nominal write data, otherwise we may 894 * force a state where we start to write itsy bitsy tidbits 895 * of data. 896 */ 897
898 switch(sk->state) { 899 caseTCP_TIME_WAIT:
900 /* 901 * keep us in TIME_WAIT until we stop getting packets, 902 * reset the timeout. 903 */ 904 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
905 break;
906 caseTCP_CLOSE:
907 /* 908 * don't touch the timer. 909 */ 910 break;
911 default:
912 /* 913 * Must check send_head, write_queue, and ack_backlog 914 * to determine which timeout to use. 915 */ 916 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) { 917 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
918 }elseif (sk->keepopen) { 919 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
920 }else{ 921 del_timer(&sk->retransmit_timer);
922 sk->ip_xmit_timeout = 0;
923 } 924 break;
925 } 926 } 927
928 /* 929 * We have nothing queued but space to send. Send any partial 930 * packets immediately (end of Nagle rule application). 931 */ 932
933 if (sk->packets_out == 0 && sk->partial != NULL &&
934 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
935 { 936 flag |= 1;
937 tcp_send_partial(sk);
938 } 939
940 /* 941 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and 942 * we are now waiting for an acknowledge to our FIN. The other end is 943 * already in TIME_WAIT. 944 * 945 * Move to TCP_CLOSE on success. 946 */ 947
948 if (sk->state == TCP_LAST_ACK)
949 { 950 if (!sk->dead)
951 sk->state_change(sk);
952 if(sk->debug)
953 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
954 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
955 if (sk->rcv_ack_seq == sk->write_seq/*&& sk->acked_seq == sk->fin_seq*/)
956 { 957 flag |= 1;
958 sk->shutdown = SHUTDOWN_MASK;
959 tcp_set_state(sk,TCP_CLOSE);
960 return 1;
961 } 962 } 963
964 /* 965 * Incoming ACK to a FIN we sent in the case of our initiating the close. 966 * 967 * Move to FIN_WAIT2 to await a FIN from the other end. Set 968 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in. 969 */ 970
971 if (sk->state == TCP_FIN_WAIT1)
972 { 973
974 if (!sk->dead)
975 sk->state_change(sk);
976 if (sk->rcv_ack_seq == sk->write_seq)
977 { 978 flag |= 1;
979 sk->shutdown |= SEND_SHUTDOWN;
980 tcp_set_state(sk, TCP_FIN_WAIT2);
981 } 982 } 983
984 /* 985 * Incoming ACK to a FIN we sent in the case of a simultaneous close. 986 * 987 * Move to TIME_WAIT 988 */ 989
990 if (sk->state == TCP_CLOSING)
991 { 992
993 if (!sk->dead)
994 sk->state_change(sk);
995 if (sk->rcv_ack_seq == sk->write_seq)
996 { 997 flag |= 1;
998 tcp_time_wait(sk);
999 }1000 }1001
1002 /*1003 * Final ack of a three way shake 1004 */1005
1006 if(sk->state==TCP_SYN_RECV)
1007 {1008 tcp_set_state(sk, TCP_ESTABLISHED);
1009 tcp_options(sk,th);
1010 sk->dummy_th.dest=th->source;
1011 sk->copied_seq = sk->acked_seq;
1012 if(!sk->dead)
1013 sk->state_change(sk);
1014 if(sk->max_window==0)
1015 {1016 sk->max_window=32; /* Sanity check */1017 sk->mss=min(sk->max_window,sk->mtu);
1018 }1019 }1020
1021 /*1022 * I make no guarantees about the first clause in the following1023 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under1024 * what conditions "!flag" would be true. However I think the rest1025 * of the conditions would prevent that from causing any1026 * unnecessary retransmission. 1027 * Clearly if the first packet has expired it should be 1028 * retransmitted. The other alternative, "flag&2 && retransmits", is1029 * harder to explain: You have to look carefully at how and when the1030 * timer is set and with what timeout. The most recent transmission always1031 * sets the timer. So in general if the most recent thing has timed1032 * out, everything before it has as well. So we want to go ahead and1033 * retransmit some more. If we didn't explicitly test for this1034 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"1035 * would not be true. If you look at the pattern of timing, you can1036 * show that rto is increased fast enough that the next packet would1037 * almost never be retransmitted immediately. Then you'd end up1038 * waiting for a timeout to send each packet on the retransmission1039 * queue. With my implementation of the Karn sampling algorithm,1040 * the timeout would double each time. The net result is that it would1041 * take a hideous amount of time to recover from a single dropped packet.1042 * It's possible that there should also be a test for TIME_WRITE, but1043 * I think as long as "send_head != NULL" and "retransmit" is on, we've1044 * got to be in real retransmission mode.1045 * Note that tcp_do_retransmit is called with all==1. Setting cong_window1046 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.1047 * As long as no further losses occur, this seems reasonable.1048 */1049
1050 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
1051 (((flag&2) && sk->retransmits) ||
1052 (sk->send_head->when + sk->rto < jiffies)))
1053 {1054 if(sk->send_head->when + sk->rto < jiffies)
1055 tcp_retransmit(sk,0);
1056 else1057 {1058 tcp_do_retransmit(sk, 1);
1059 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1060 }1061 }1062
1063 return 1;
1064
1065 uninteresting_ack:
1066 if(sk->debug)
1067 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
1068
1069 /*1070 * Keepalive processing.1071 */1072
1073 if (after(ack, sk->sent_seq))
1074 {1075 return 0;
1076 }1077
1078 /*1079 * Restart the keepalive timer.1080 */1081
1082 if (sk->keepopen)
1083 {1084 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
1085 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1086 }1087 return 1;
1088 }1089
1090
1091 /*1092 * Process the FIN bit. This now behaves as it is supposed to work1093 * and the FIN takes effect when it is validly part of sequence1094 * space. Not before when we get holes.1095 *1096 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT1097 * (and thence onto LAST-ACK and finally, CLOSE, we never enter1098 * TIME-WAIT)1099 *1100 * If we are in FINWAIT-1, a received FIN indicates simultaneous1101 * close and we go into CLOSING (and later onto TIME-WAIT)1102 *1103 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.1104 *1105 */1106
1107 staticinttcp_fin(structsk_buff *skb, structsock *sk, structtcphdr *th)
/* */1108 {1109 sk->fin_seq = skb->end_seq;
1110
1111 if (!sk->dead)
1112 {1113 sk->state_change(sk);
1114 sock_wake_async(sk->socket, 1);
1115 }1116
1117 switch(sk->state)
1118 {1119 caseTCP_SYN_RECV:
1120 caseTCP_SYN_SENT:
1121 caseTCP_ESTABLISHED:
1122 /*1123 * move to CLOSE_WAIT, tcp_data() already handled1124 * sending the ack.1125 */1126 tcp_set_state(sk,TCP_CLOSE_WAIT);
1127 if (th->rst)
1128 sk->shutdown = SHUTDOWN_MASK;
1129 break;
1130
1131 caseTCP_CLOSE_WAIT:
1132 caseTCP_CLOSING:
1133 /*1134 * received a retransmission of the FIN, do1135 * nothing.1136 */1137 break;
1138 caseTCP_TIME_WAIT:
1139 /*1140 * received a retransmission of the FIN,1141 * restart the TIME_WAIT timer.1142 */1143 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1144 return(0);
1145 caseTCP_FIN_WAIT1:
1146 /*1147 * This case occurs when a simultaneous close1148 * happens, we must ack the received FIN and1149 * enter the CLOSING state.1150 *1151 * This causes a WRITE timeout, which will either1152 * move on to TIME_WAIT when we timeout, or resend1153 * the FIN properly (maybe we get rid of that annoying1154 * FIN lost hang). The TIME_WRITE code is already correct1155 * for handling this timeout.1156 */1157
1158 if(sk->ip_xmit_timeout != TIME_WRITE)
1159 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1160 tcp_set_state(sk,TCP_CLOSING);
1161 break;
1162 caseTCP_FIN_WAIT2:
1163 /*1164 * received a FIN -- send ACK and enter TIME_WAIT1165 */1166 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1167 sk->shutdown|=SHUTDOWN_MASK;
1168 tcp_set_state(sk,TCP_TIME_WAIT);
1169 break;
1170 caseTCP_CLOSE:
1171 /*1172 * already in CLOSE1173 */1174 break;
1175 default:
1176 tcp_set_state(sk,TCP_LAST_ACK);
1177
1178 /* Start the timers. */1179 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1180 return(0);
1181 }1182
1183 return(0);
1184 }1185
1186 /*1187 * Add a sk_buff to the TCP receive queue, calculating1188 * the ACK sequence as we go..1189 */1190 staticinlinevoidtcp_insert_skb(structsk_buff * skb, structsk_buff_head * list)
/* */1191 {1192 structsk_buff * prev, * next;
1193 u32seq;
1194
1195 /*1196 * Find where the new skb goes.. (This goes backwards,1197 * on the assumption that we get the packets in order)1198 */1199 seq = skb->seq;
1200 prev = list->prev;
1201 next = (structsk_buff *) list;
1202 for (;;) {1203 if (prev == (structsk_buff *) list || !after(prev->seq, seq))
1204 break;
1205 next = prev;
1206 prev = prev->prev;
1207 }1208 __skb_insert(skb, prev, next, list);
1209 }1210
1211 /*1212 * Called for each packet when we find a new ACK endpoint sequence in it1213 */1214 staticinlineu32tcp_queue_ack(structsk_buff * skb, structsock * sk)
/* */1215 {1216 /*1217 * When we ack the fin, we do the FIN 1218 * processing.1219 */1220 skb->acked = 1;
1221 if (skb->h.th->fin)
1222 tcp_fin(skb,sk,skb->h.th);
1223 returnskb->end_seq;
1224 }1225
1226 staticvoidtcp_queue(structsk_buff * skb, structsock * sk,
/* */1227 structtcphdr *th, unsignedlongsaddr)
1228 {1229 u32ack_seq;
1230
1231 tcp_insert_skb(skb, &sk->receive_queue);
1232 /*1233 * Did we get anything new to ack?1234 */1235 ack_seq = sk->acked_seq;
1236 if (!after(skb->seq, ack_seq) && after(skb->end_seq, ack_seq)) {1237 structsk_buff_head * list = &sk->receive_queue;
1238 structsk_buff * next;
1239 ack_seq = tcp_queue_ack(skb, sk);
1240
1241 /*1242 * Do we have any old packets to ack that the above1243 * made visible? (Go forward from skb)1244 */1245 next = skb->next;
1246 while (next != (structsk_buff *) list) {1247 if (after(next->seq, ack_seq))
1248 break;
1249 if (after(next->end_seq, ack_seq))
1250 ack_seq = tcp_queue_ack(next, sk);
1251 next = next->next;
1252 }1253
1254 /*1255 * Ok, we found new data, update acked_seq as1256 * necessary (and possibly send the actual1257 * ACK packet).1258 */1259 sk->acked_seq = ack_seq;
1260
1261 /*1262 * rules for delaying an ack:1263 * - delay time <= 0.5 HZ1264 * - must send at least every 2 full sized packets1265 * - we don't have a window update to send1266 *1267 * We handle the window update in the actual read1268 * side, so we only have to worry about the first two.1269 */1270 if (!sk->delay_acks || th->fin) {1271 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
1272 }1273 else1274 {1275 inttimeout = sk->ato;
1276 if (timeout > HZ/2)
1277 timeout = HZ/2;
1278 if (sk->bytes_rcv > sk->max_unacked) {1279 timeout = 0;
1280 mark_bh(TIMER_BH);
1281 }1282 sk->ack_backlog++;
1283 if(sk->debug)
1284 printk("Ack queued.\n");
1285 tcp_reset_xmit_timer(sk, TIME_WRITE, timeout);
1286 }1287 }1288 }1289
1290
1291 /*1292 * This routine handles the data. If there is room in the buffer,1293 * it will be have already been moved into it. If there is no1294 * room, then we will just have to discard the packet.1295 */1296
1297 staticinttcp_data(structsk_buff *skb, structsock *sk,
/* */1298 unsignedlongsaddr, unsignedshortlen)
1299 {1300 structtcphdr *th;
1301 u32new_seq, shut_seq;
1302
1303 th = skb->h.th;
1304 skb_pull(skb,th->doff*4);
1305 skb_trim(skb,len-(th->doff*4));
1306
1307 /*1308 * The bytes in the receive read/assembly queue has increased. Needed for the1309 * low memory discard algorithm 1310 */1311
1312 sk->bytes_rcv += skb->len;
1313
1314 if (skb->len == 0 && !th->fin)
1315 {1316 /* 1317 * Don't want to keep passing ack's back and forth. 1318 * (someone sent us dataless, boring frame)1319 */1320 if (!th->ack)
1321 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
1322 kfree_skb(skb, FREE_READ);
1323 return(0);
1324 }1325
1326 /*1327 * We no longer have anyone receiving data on this connection.1328 */1329
1330 #ifndef TCP_DONT_RST_SHUTDOWN
1331
1332 if(sk->shutdown & RCV_SHUTDOWN)
1333 {1334 /*1335 * FIXME: BSD has some magic to avoid sending resets to1336 * broken 4.2 BSD keepalives. Much to my surprise a few non1337 * BSD stacks still have broken keepalives so we want to1338 * cope with it.1339 */1340
1341 if(skb->len) /* We don't care if it's just an ack or1342 a keepalive/window probe */1343 {1344 new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */1345
1346 /* Do this the way 4.4BSD treats it. Not what I'd1347 regard as the meaning of the spec but it's what BSD1348 does and clearly they know everything 8) */1349
1350 /*1351 * This is valid because of two things1352 *1353 * a) The way tcp_data behaves at the bottom.1354 * b) A fin takes effect when read not when received.1355 */1356
1357 shut_seq = sk->acked_seq+1; /* Last byte */1358
1359 if(after(new_seq,shut_seq))
1360 {1361 if(sk->debug)
1362 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
1363 sk, new_seq, shut_seq, sk->blog);
1364 if(sk->dead)
1365 {1366 sk->acked_seq = new_seq + th->fin;
1367 tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
1368 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
1369 tcp_statistics.TcpEstabResets++;
1370 sk->err = EPIPE;
1371 sk->error_report(sk);
1372 sk->shutdown = SHUTDOWN_MASK;
1373 tcp_set_state(sk,TCP_CLOSE);
1374 kfree_skb(skb, FREE_READ);
1375 return 0;
1376 }1377 }1378 }1379 }1380
1381 #endif1382
1383 tcp_queue(skb, sk, th, saddr);
1384
1385 /*1386 * If we've missed a packet, send an ack.1387 * Also start a timer to send another.1388 */1389
1390 if (!skb->acked)
1391 {1392 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
1393 sk->ack_backlog++;
1394 tcp_reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, HZ/2));
1395 }1396
1397 /*1398 * Now tell the user we may have some data. 1399 */1400
1401 if (!sk->dead)
1402 {1403 if(sk->debug)
1404 printk("Data wakeup.\n");
1405 sk->data_ready(sk,0);
1406 }1407 return(0);
1408 }1409
1410
1411 /*1412 * This routine is only called when we have urgent data1413 * signalled. Its the 'slow' part of tcp_urg. It could be1414 * moved inline now as tcp_urg is only called from one1415 * place. We handle URGent data wrong. We have to - as1416 * BSD still doesn't use the correction from RFC961.1417 */1418
1419 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */1420 {1421 u32ptr = ntohs(th->urg_ptr);
1422
1423 if (ptr)
1424 ptr--;
1425 ptr += ntohl(th->seq);
1426
1427 /* ignore urgent data that we've already seen and read */1428 if (after(sk->copied_seq, ptr))
1429 return;
1430
1431 /* do we already have a newer (or duplicate) urgent pointer? */1432 if (sk->urg_data && !after(ptr, sk->urg_seq))
1433 return;
1434
1435 /* tell the world about our new urgent pointer */1436 if (sk->proc != 0) {1437 if (sk->proc > 0) {1438 kill_proc(sk->proc, SIGURG, 1);
1439 }else{1440 kill_pg(-sk->proc, SIGURG, 1);
1441 }1442 }1443 sk->urg_data = URG_NOTYET;
1444 sk->urg_seq = ptr;
1445 }1446
1447 /*1448 * This is the 'fast' part of urgent handling.1449 */1450
1451 staticinlinevoidtcp_urg(structsock *sk, structtcphdr *th, unsignedlonglen)
/* */1452 {1453 /*1454 * Check if we get a new urgent pointer - normally not 1455 */1456
1457 if (th->urg)
1458 tcp_check_urg(sk,th);
1459
1460 /*1461 * Do we wait for any urgent data? - normally not1462 */1463
1464 if (sk->urg_data == URG_NOTYET) {1465 u32ptr;
1466
1467 /*1468 * Is the urgent pointer pointing into this packet? 1469 */1470 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
1471 if (ptr < len) {1472 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
1473 if (!sk->dead)
1474 sk->data_ready(sk,0);
1475 }1476 }1477 }1478
1479 /*1480 * This should be a bit smarter and remove partially1481 * overlapping stuff too, but this should be good1482 * enough for any even remotely normal case (and the1483 * worst that can happen is that we have a few1484 * unnecessary packets in the receive queue).1485 *1486 * This function is never called with an empty list..1487 */1488 staticinlinevoidtcp_remove_dups(structsk_buff_head * list)
/* */1489 {1490 structsk_buff * next = list->next;
1491
1492 for (;;) {1493 structsk_buff * skb = next;
1494 next = next->next;
1495 if (next == (structsk_buff *) list)
1496 break;
1497 if (before(next->end_seq, skb->end_seq)) {1498 __skb_unlink(next, list);
1499 kfree_skb(next, FREE_READ);
1500 next = skb;
1501 continue;
1502 }1503 if (next->seq != skb->seq)
1504 continue;
1505 __skb_unlink(skb, list);
1506 kfree_skb(skb, FREE_READ);
1507 }1508 }1509
1510 /*1511 * Throw out all unnecessary packets: we've gone over the1512 * receive queue limit. This shouldn't happen in a normal1513 * TCP connection, but we might have gotten duplicates etc.1514 */1515 staticvoidprune_queue(structsk_buff_head * list)
/* */1516 {1517 for (;;) {1518 structsk_buff * skb = list->prev;
1519
1520 /* gone through it all? */1521 if (skb == (structsk_buff *) list)
1522 break;
1523 if (!skb->acked) {1524 __skb_unlink(skb, list);
1525 kfree_skb(skb, FREE_READ);
1526 continue;
1527 }1528 tcp_remove_dups(list);
1529 break;
1530 }1531 }1532
1533 /*1534 * A TCP packet has arrived.1535 * skb->h.raw is the TCP header.1536 */1537
1538 inttcp_rcv(structsk_buff *skb, structdevice *dev, structoptions *opt,
/* */1539 __u32daddr, unsignedshortlen,
1540 __u32saddr, intredo, structinet_protocol * protocol)
1541 {1542 structtcphdr *th;
1543 structsock *sk;
1544 intsyn_ok=0;
1545
1546 /*1547 * "redo" is 1 if we have already seen this skb but couldn't1548 * use it at that time (the socket was locked). In that case1549 * we have already done a lot of the work (looked up the socket1550 * etc).1551 */1552 th = skb->h.th;
1553 sk = skb->sk;
1554 if (!redo) {1555 tcp_statistics.TcpInSegs++;
1556 if (skb->pkt_type!=PACKET_HOST)
1557 gotodiscard_it;
1558
1559 /*1560 * Pull up the IP header.1561 */1562
1563 skb_pull(skb, skb->h.raw-skb->data);
1564
1565 /*1566 * Try to use the device checksum if provided.1567 */1568 switch (skb->ip_summed)
1569 {1570 caseCHECKSUM_NONE:
1571 skb->csum = csum_partial((char *)th, len, 0);
1572 caseCHECKSUM_HW:
1573 if (tcp_check(th, len, saddr, daddr, skb->csum))
1574 gotodiscard_it;
1575 default:
1576 /* CHECKSUM_UNNECESSARY */1577 }1578 sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
1579 if (!sk)
1580 gotono_tcp_socket;
1581 skb->sk = sk;
1582 skb->seq = ntohl(th->seq);
1583 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
1584 skb->ack_seq = ntohl(th->ack_seq);
1585
1586 skb->acked = 0;
1587 skb->used = 0;
1588 skb->free = 1;
1589 skb->saddr = daddr;
1590 skb->daddr = saddr;
1591
1592 /*1593 * We may need to add it to the backlog here. 1594 */1595 if (sk->users)
1596 {1597 __skb_queue_tail(&sk->back_log, skb);
1598 return(0);
1599 }1600 }1601
1602 /*1603 * If this socket has got a reset it's to all intents and purposes 1604 * really dead. Count closed sockets as dead.1605 *1606 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD1607 * simply drops data. This seems incorrect as a 'closed' TCP doesn't1608 * exist so should cause resets as if the port was unreachable.1609 */1610
1611 if (sk->zapped || sk->state==TCP_CLOSE)
1612 gotono_tcp_socket;
1613
1614 if (!sk->prot)
1615 {1616 printk("IMPOSSIBLE 3\n");
1617 return(0);
1618 }1619
1620
1621 /*1622 * Charge the memory to the socket. 1623 */1624
1625 skb->sk=sk;
1626 atomic_add(skb->truesize, &sk->rmem_alloc);
1627
1628 /*1629 * We should now do header prediction.1630 */1631
1632 /*1633 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We1634 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug1635 * compatibility. We also set up variables more thoroughly [Karn notes in the1636 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].1637 */1638
1639 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */1640 {1641
1642 /*1643 * Now deal with unusual cases.1644 */1645
1646 if(sk->state==TCP_LISTEN)
1647 {1648 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */1649 tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
1650
1651 /*1652 * We don't care for RST, and non SYN are absorbed (old segments)1653 * Broadcast/multicast SYN isn't allowed. Note - bug if you change the1654 * netmask on a running connection it can go broadcast. Even Sun's have1655 * this problem so I'm ignoring it 1656 */1657
1658 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
1659 {1660 kfree_skb(skb, FREE_READ);
1661 return 0;
1662 }1663
1664 /* 1665 * Guess we need to make a new socket up 1666 */1667
1668 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
1669
1670 /*1671 * Now we have several options: In theory there is nothing else1672 * in the frame. KA9Q has an option to send data with the syn,1673 * BSD accepts data with the syn up to the [to be] advertised window1674 * and Solaris 2.1 gives you a protocol error. For now we just ignore1675 * it, that fits the spec precisely and avoids incompatibilities. It1676 * would be nice in future to drop through and process the data.1677 *1678 * Now TTCP is starting to use we ought to queue this data.1679 */1680
1681 return 0;
1682 }1683
1684 /* 1685 * Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN1686 * then its a new connection1687 */1688
1689 if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
1690 {1691 kfree_skb(skb, FREE_READ);
1692 return 0;
1693 }1694
1695 /*1696 * SYN sent means we have to look for a suitable ack and either reset1697 * for bad matches or go to connected. The SYN_SENT case is unusual and should1698 * not be in line code. [AC]1699 */1700
1701 if(sk->state==TCP_SYN_SENT)
1702 {1703 /* Crossed SYN or previous junk segment */1704 if(th->ack)
1705 {1706 /* We got an ack, but it's not a good ack */1707 if(!tcp_ack(sk,th,skb->ack_seq,len))
1708 {1709 /* Reset the ack - its an ack from a 1710 different connection [ th->rst is checked in tcp_send_reset()] */1711 tcp_statistics.TcpAttemptFails++;
1712 tcp_send_reset(daddr, saddr, th,
1713 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1714 kfree_skb(skb, FREE_READ);
1715 return(0);
1716 }1717 if(th->rst)
1718 returntcp_reset(sk,skb);
1719 if(!th->syn)
1720 {1721 /* A valid ack from a different connection1722 start. Shouldn't happen but cover it */1723 tcp_statistics.TcpAttemptFails++;
1724 tcp_send_reset(daddr, saddr, th,
1725 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1726 kfree_skb(skb, FREE_READ);
1727 return 0;
1728 }1729 /*1730 * Ok.. it's good. Set up sequence numbers and1731 * move to established.1732 */1733 syn_ok=1; /* Don't reset this connection for the syn */1734 sk->acked_seq = skb->seq+1;
1735 sk->lastwin_seq = skb->seq+1;
1736 sk->fin_seq = skb->seq;
1737 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
1738 tcp_set_state(sk, TCP_ESTABLISHED);
1739 tcp_options(sk,th);
1740 sk->dummy_th.dest=th->source;
1741 sk->copied_seq = sk->acked_seq;
1742 if(!sk->dead)
1743 {1744 sk->state_change(sk);
1745 sock_wake_async(sk->socket, 0);
1746 }1747 if(sk->max_window==0)
1748 {1749 sk->max_window = 32;
1750 sk->mss = min(sk->max_window, sk->mtu);
1751 }1752 }1753 else1754 {1755 /* See if SYN's cross. Drop if boring */1756 if(th->syn && !th->rst)
1757 {1758 /* Crossed SYN's are fine - but talking to1759 yourself is right out... */1760 if(sk->saddr==saddr && sk->daddr==daddr &&
1761 sk->dummy_th.source==th->source &&
1762 sk->dummy_th.dest==th->dest)
1763 {1764 tcp_statistics.TcpAttemptFails++;
1765 returntcp_reset(sk,skb);
1766 }1767 tcp_set_state(sk,TCP_SYN_RECV);
1768
1769 /*1770 * FIXME:1771 * Must send SYN|ACK here1772 */1773 }1774 /* Discard junk segment */1775 kfree_skb(skb, FREE_READ);
1776 return 0;
1777 }1778 /*1779 * SYN_RECV with data maybe.. drop through1780 */1781 gotorfc_step6;
1782 }1783
1784 /*1785 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is1786 * a more complex suggestion for fixing these reuse issues in RFC16441787 * but not yet ready for general use. Also see RFC1379.1788 *1789 * Note the funny way we go back to the top of this function for1790 * this case ("goto try_next_socket"). That also takes care of1791 * checking "sk->users" for the new socket as well as doing all1792 * the normal tests on the packet.1793 */1794
1795 #defineBSD_TIME_WAIT1796 #ifdefBSD_TIME_WAIT1797 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
1798 after(skb->seq, sk->acked_seq) && !th->rst)
1799 {1800 u32seq = sk->write_seq;
1801 if(sk->debug)
1802 printk("Doing a BSD time wait\n");
1803 tcp_statistics.TcpEstabResets++;
1804 atomic_sub(skb->truesize, &sk->rmem_alloc);
1805 skb->sk = NULL;
1806 sk->err=ECONNRESET;
1807 tcp_set_state(sk, TCP_CLOSE);
1808 sk->shutdown = SHUTDOWN_MASK;
1809 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
1810 /* this is not really correct: we should check sk->users */1811 if (sk && sk->state==TCP_LISTEN)
1812 {1813 skb->sk = sk;
1814 atomic_add(skb->truesize, &sk->rmem_alloc);
1815 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
1816 return 0;
1817 }1818 kfree_skb(skb, FREE_READ);
1819 return 0;
1820 }1821 #endif1822 }1823
1824 /*1825 * We are now in normal data flow (see the step list in the RFC)1826 * Note most of these are inline now. I'll inline the lot when1827 * I have time to test it hard and look at what gcc outputs 1828 */1829
1830 if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
1831 {1832 bad_tcp_sequence(sk, th, len, opt, saddr, dev);
1833 kfree_skb(skb, FREE_READ);
1834 return 0;
1835 }1836
1837 if(th->rst)
1838 returntcp_reset(sk,skb);
1839
1840 /*1841 * !syn_ok is effectively the state test in RFC793.1842 */1843
1844 if(th->syn && !syn_ok)
1845 {1846 tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
1847 returntcp_reset(sk,skb);
1848 }1849
1850 tcp_delack_estimator(sk);
1851
1852 /*1853 * Process the ACK1854 */1855
1856
1857 if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
1858 {1859 /*1860 * Our three way handshake failed.1861 */1862
1863 if(sk->state==TCP_SYN_RECV)
1864 {1865 tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
1866 }1867 kfree_skb(skb, FREE_READ);
1868 return 0;
1869 }1870
1871 rfc_step6: /* I'll clean this up later */1872
1873 /*1874 * If the accepted buffer put us over our queue size we1875 * now drop it (we must process the ack first to avoid1876 * deadlock cases).1877 */1878
1879 /*1880 * Process urgent data1881 */1882
1883 tcp_urg(sk, th, len);
1884
1885 /*1886 * Process the encapsulated data1887 */1888
1889 if(tcp_data(skb,sk, saddr, len))
1890 kfree_skb(skb, FREE_READ);
1891
1892 /*1893 * If our receive queue has grown past its limits,1894 * try to prune away duplicates etc..1895 */1896 if (sk->rmem_alloc > sk->rcvbuf)
1897 prune_queue(&sk->receive_queue);
1898
1899 /*1900 * And done1901 */1902
1903 return 0;
1904
1905 no_tcp_socket:
1906 /*1907 * No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)1908 */1909 tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
1910
1911 discard_it:
1912 /*1913 * Discard frame1914 */1915 skb->sk = NULL;
1916 kfree_skb(skb, FREE_READ);
1917 return 0;
1918 }