/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	@(#)tcp_input.c	1.0.16	05/25/93
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * FIXES
 *		Pedro Roque	:	Double ACK bug
 */
#include <linux/config.h>
#include <net/tcp.h>

/*
 *	Policy code extracted so it's now separate
 */

/*
 *	Called each time to estimate the delayed ack timeout. This is
 *	how it should be done so a fast link isn't impacted by ack delay.
 */

extern __inline__ void tcp_delack_estimator(struct sock *sk)
{
	/*
	 *	Delayed ACK time estimator.
	 */

	if (sk->lrcvtime == 0)
	{
		sk->lrcvtime = jiffies;
		sk->ato = HZ/3;
	}
	else
	{
		int m;

		m = jiffies - sk->lrcvtime;

		sk->lrcvtime = jiffies;

		if (m <= 0)
			m = 1;

		if (m > (sk->rtt >> 3))
		{
			sk->ato = sk->rtt >> 3;
			/*
			 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
			 */
		}
		else
		{
			sk->ato = (sk->ato >> 1) + m;
			/*
			 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
			 */
		}
	}
}
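
/*
 * Illustrative note (editor's addition, not from the original authors):
 * sk->ato is a smoothed estimate of the packet inter-arrival time, used
 * as the delayed-ACK timeout. Each new gap m is folded in as
 * ato = ato/2 + m, and on a fast-feeding link ato is clamped to rtt/8.
 * For example, with HZ=100 and segments arriving every 2 jiffies, ato
 * converges towards about 4 jiffies, so ACK delay stays tiny.
 */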

/*
 *	Called on frames that were known _not_ to have been
 *	retransmitted [see Karn/Partridge Proceedings SIGCOMM 87].
 *	The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
 */

extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
{
	long m;
	/*
	 *	The following amusing code comes from Jacobson's
	 *	article in SIGCOMM '88. Note that rtt and mdev
	 *	are scaled versions of rtt and mean deviation.
	 *	This is designed to be as fast as possible.
	 *	m stands for "measurement".
	 */

	m = jiffies - oskb->when;	/* RTT */
	if (m <= 0)
		m = 1;			/* IS THIS RIGHT FOR <0 ??? */
	m -= (sk->rtt >> 3);		/* m is now error in rtt est */
	sk->rtt += m;			/* rtt = 7/8 rtt + 1/8 new */
	if (m < 0)
		m = -m;			/* m is now abs(error) */
	m -= (sk->mdev >> 2);		/* similar update on mdev */
	sk->mdev += m;			/* mdev = 3/4 mdev + 1/4 new */

	/*
	 *	Now update timeout. Note that this removes any backoff.
	 */

	sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
	if (sk->rto > 120*HZ)
		sk->rto = 120*HZ;
	if (sk->rto < HZ/5)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
		sk->rto = HZ/5;
	sk->backoff = 0;
}
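
/*
 * Worked example (editor's addition, following the scaling above):
 * sk->rtt holds 8 times the smoothed RTT and sk->mdev 4 times the mean
 * deviation, so
 *
 *	rto = ((8*srtt >> 2) + 4*mdev) >> 1 = srtt + 2*mdev
 *
 * With HZ=100, an srtt of 30 jiffies and mdev of 5 jiffies this gives
 * rto = 30 + 10 = 40 jiffies (400ms), then clamped into [HZ/5, 120*HZ]
 * by the tests above.
 */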

/*
 *	Cached last hit socket
 */

static volatile unsigned long	th_cache_saddr, th_cache_daddr;
static volatile unsigned short	th_cache_dport, th_cache_sport;
static volatile struct sock	*th_cache_sk;

void tcp_cache_zap(void)
{
	th_cache_sk = NULL;
}

/*
 *	Find the socket, using the last hit cache if applicable. The cache
 *	is not quite right...
 */

static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
{
	struct sock * sk;

	sk = (struct sock *) th_cache_sk;
	if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
	    sport != th_cache_sport || dport != th_cache_dport) {
		sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
		if (sk) {
			th_cache_saddr = saddr;
			th_cache_daddr = daddr;
			th_cache_dport = dport;
			th_cache_sport = sport;
			th_cache_sk = sk;
		}
	}
	return sk;
}
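
/*
 * Editor's note: this is a one-entry demultiplexing cache keyed on the
 * (saddr, sport, daddr, dport) 4-tuple. Back-to-back segments for the
 * same connection skip the get_sock() lookup entirely; a segment for
 * any other connection just refills the entry. tcp_cache_zap() must be
 * called whenever a cached socket may be going away.
 */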

/*
 *	React to an out-of-window TCP sequence number in an incoming packet
 */

static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
	     struct options *opt, unsigned long saddr, struct device *dev)
{
	if (th->rst)
		return;

	/*
	 *	Send a reset if we get something not ours and we are
	 *	unsynchronized. Note: We don't do anything to our end. We
	 *	are just killing the bogus remote connection then we will
	 *	connect again and it will work (with luck).
	 */

	if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
	{
		tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
		return;
	}

	/*
	 *	4.3reno machines look for these kind of acks so they can do fast
	 *	recovery. Three identical 'old' acks let it know that one frame has
	 *	been lost and should be resent. Because this is before the whole window
	 *	of data has timed out it can take one lost frame per window without
	 *	stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
	 *
	 *	We also should be spotting triple bad sequences.
	 */
	tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
	return;
}

/*
 *	This function checks to see if the tcp header is actually acceptable.
 */

extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
{
	u32 end_window = sk->acked_seq + sk->window;
	return	/* if start is at end of window, end must be too (zero window) */
		(seq == end_window && seq == end_seq) ||
		/* if start is before end of window, check for interest */
		(before(seq, end_window) && !before(end_seq, sk->acked_seq));
}
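
/*
 * Example (editor's addition): with acked_seq = 1000 and window = 500
 * the acceptable region is [1000, 1500). A segment with seq = 1200 and
 * end_seq = 1300 overlaps it and is accepted; seq = 1600 lies beyond
 * the right edge and is rejected; with a zero window only a bare
 * segment with seq == end_seq == 1000 passes, via the first clause.
 */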

/*
 *	When we get a reset we do this. This probably is a tcp_output routine
 *	really.
 */

static int tcp_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;
	/*
	 *	We want the right error as BSD sees it (and indeed as we do).
	 */
	sk->err = ECONNRESET;
	if (sk->state == TCP_SYN_SENT)
		sk->err = ECONNREFUSED;
	if (sk->state == TCP_CLOSE_WAIT)
		sk->err = EPIPE;
#ifdef CONFIG_TCP_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 *
	 *	This is a good idea, but causes more sockets to take time to close.
	 *
	 *	Ian Heavens has since shown this is an inadequate fix for the protocol
	 *	bug in question.
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	return(0);
}

/*
 *	Look for tcp options. Parses everything but only knows about MSS.
 *	This routine is always called with the packet containing the SYN.
 *	However it may also be called with the ack to the SYN. So you
 *	can't assume this is always the SYN. It's always called after
 *	we have set up sk->mtu to our own MTU.
 *
 *	We need at minimum to add PAWS support here. Possibly large windows
 *	as Linux gets deployed on 100Mb/sec networks.
 */

static void tcp_options(struct sock *sk, struct tcphdr *th)
{
	unsigned char *ptr;
	int length = (th->doff*4) - sizeof(struct tcphdr);
	int mss_seen = 0;

	ptr = (unsigned char *)(th + 1);

	while (length > 0)
	{
		int opcode = *ptr++;
		int opsize = *ptr++;
		switch (opcode)
		{
			case TCPOPT_EOL:
				return;
			case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
				length--;
				ptr--;		/* the opsize=*ptr++ above was a mistake */
				continue;

			default:
				if (opsize <= 2)	/* Avoid silly options looping forever */
					return;
				switch (opcode)
				{
					case TCPOPT_MSS:
						if (opsize == 4 && th->syn)
						{
							sk->mtu = min(sk->mtu, ntohs(*(unsigned short *)ptr));
							mss_seen = 1;
						}
						break;
					/* Add other options here as people feel the urge to implement stuff like large windows */
				}
				ptr += opsize - 2;
				length -= opsize;
		}
	}
	if (th->syn)
	{
		if (!mss_seen)
			sk->mtu = min(sk->mtu, 536);	/* default MSS if none sent */
	}
#ifdef CONFIG_INET_PCTCP
	sk->mss = min(sk->max_window >> 1, sk->mtu);
#else
	sk->mss = min(sk->max_window, sk->mtu);
	sk->max_unacked = 2 * sk->mss;
#endif
}
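
/*
 * Wire format reminder (editor's addition): each non-NOP option is
 * <kind:1 byte> <length:1 byte> <data:length-2 bytes>. An MSS option
 * announcing 1460 is the four bytes 0x02 0x04 0x05 0xB4, which is why
 * only opsize == 4 is accepted above before the 16-bit value at ptr is
 * read.
 */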

/*
 *	This routine handles a connection request.
 *	It should make sure we haven't already responded.
 *	Because of the way BSD works, we have to send a syn/ack now.
 *	This also means it will be harder to close a socket which is
 *	listening.
 */

static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
{
	struct sock *newsk;
	struct tcphdr *th;
	struct rtable *rt;

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead)
	{
		sk->data_ready(sk,0);
	}
	else
	{
		if (sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	Make sure we can accept more. This will prevent a
	 *	flurry of syns from eating up all our memory.
	 *
	 *	BSD does some funnies here and allows 3/2 times the
	 *	set backlog as a fudge factor. That's just too gross.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	We need to build a new sock struct.
	 *	It is sort of bad to have a socket without an inode attached
	 *	to it, but the wake_up's will just wake up the listening socket,
	 *	and if the listening socket is destroyed before this is taken
	 *	off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* just ignore the syn. It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	memcpy(newsk, sk, sizeof(*newsk));
	newsk->opt = NULL;
	newsk->ip_route_cache = NULL;
	if (opt && opt->optlen)
	{
		sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
		if (!sk->opt)
		{
			kfree_s(newsk, sizeof(struct sock));
			tcp_statistics.TcpAttemptFails++;
			kfree_skb(skb, FREE_READ);
			return;
		}
		if (ip_options_echo(sk->opt, opt, daddr, saddr, skb))
		{
			kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
			kfree_s(newsk, sizeof(struct sock));
			tcp_statistics.TcpAttemptFails++;
			kfree_skb(skb, FREE_READ);
			return;
		}
	}
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->ato = HZ/3;
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	newsk->acked_seq = skb->seq+1;
	newsk->lastwin_seq = skb->seq+1;
	newsk->delay_acks = 1;
	newsk->copied_seq = skb->seq+1;
	newsk->fin_seq = skb->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger = 0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function = &tcp_retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/*
	 *	Swap these two, they are from our point of view.
	 */

	newsk->daddr = saddr;
	newsk->saddr = daddr;
	newsk->rcv_saddr = daddr;

	put_sock(newsk->num,newsk);
	newsk->acked_seq = skb->seq + 1;
	newsk->copied_seq = skb->seq + 1;
	newsk->socket = NULL;

	/*
	 *	Grab the ttl and tos values and use them
	 */

	newsk->ip_ttl = sk->ip_ttl;
	newsk->ip_tos = skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for
	 */

	/*
	 *	Note use of sk->user_mss, since user has no direct access to newsk
	 */

	rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
	newsk->ip_route_cache = rt;

	if (rt != NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if (rt)
		newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	else
		newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

	/*
	 *	But not bigger than device MTU
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

#ifdef CONFIG_SKIP

	/*
	 *	SKIP devices set their MTU to 65535. This is so they can take packets
	 *	unfragmented to security process then fragment. They could lie to the
	 *	TCP layer about a suitable MTU, but it's easier to let skip sort it out
	 *	simply because the final package we want unfragmented is going to be
	 *
	 *	[IPHDR][IPSP][Security data][Modified TCP data][Security data]
	 */

	if (skip_pick_mtu != NULL)	/* If SKIP is loaded.. */
		sk->mtu = skip_pick_mtu(sk->mtu, dev);
#endif
	/*
	 *	This will min with what arrived in the packet
	 */

	tcp_options(newsk,skb->h.th);

	tcp_cache_zap();
	tcp_send_synack(newsk, sk, skb);
}
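
/*
 * Arithmetic note (editor's addition): sk->mtu here really tracks the
 * TCP payload size. On a 1500 byte Ethernet device the clamp above is
 * 1500 - 20 (iphdr) - 20 (tcphdr) = 1460, which tcp_options() then
 * reduces further if the peer's SYN carried a smaller MSS option.
 */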

/*
 *	Handle a TCP window that shrunk on us. It shouldn't happen,
 *	but..
 *
 *	We may need to move packets from the send queue
 *	to the write queue, if the window has been shrunk on us.
 *	The RFC says you are not allowed to shrink your window
 *	like this, but if the other end does, you must be able
 *	to deal with it.
 */
void tcp_window_shrunk(struct sock * sk, u32 window_seq)
{
	struct sk_buff *skb;
	struct sk_buff *skb2;
	struct sk_buff *wskb = NULL;

	skb2 = sk->send_head;
	sk->send_head = NULL;
	sk->send_tail = NULL;

	/*
	 *	This is an artifact of a flawed concept. We want one
	 *	queue and a smarter send routine when we send all.
	 */
	cli();
	while (skb2 != NULL)
	{
		skb = skb2;
		skb2 = skb->link3;
		skb->link3 = NULL;
		if (after(skb->end_seq, window_seq))
		{
			if (sk->packets_out > 0)
				sk->packets_out--;
			/* We may need to remove this from the dev send list. */
			if (skb->next != NULL)
			{
				skb_unlink(skb);
			}
			/* Now add it to the write_queue. */
			if (wskb == NULL)
				skb_queue_head(&sk->write_queue,skb);
			else
				skb_append(wskb,skb);
			wskb = skb;
		}
		else
		{
			if (sk->send_head == NULL)
			{
				sk->send_head = skb;
				sk->send_tail = skb;
			}
			else
			{
				sk->send_tail->link3 = skb;
				sk->send_tail = skb;
			}
			skb->link3 = NULL;
		}
	}
	sti();
}
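
/*
 * Example (editor's addition): suppose segments ending at 100, 200 and
 * 300 sit on the retransmit list and the peer's new right edge
 * (window_seq) is 150. The first segment stays on send_head; the other
 * two are unlinked, put back on write_queue in their original order,
 * and packets_out drops by two, so they are only re-sent once the
 * window reopens.
 */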

/*
 *	This routine deals with incoming acks, but not outgoing ones.
 *
 *	This routine is totally _WRONG_. The list structuring is wrong,
 *	the algorithm is wrong, the code is wrong.
 */

static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
{
	int flag = 0;
	u32 window_seq;

	/*
	 *	1 - there was data in packet as well as ack or new data is sent or
	 *	    in shutdown state
	 *	2 - data from retransmit queue was acked and removed
	 *	4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if (sk->zapped)
		return(1);	/* Dead, can't ack any more so why bother */

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */

	if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
		goto uninteresting_ack;

	/*
	 *	If there is data set flag 1
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 *	Have we discovered a larger window
	 */
	window_seq = ntohs(th->window);
	if (window_seq > sk->max_window)
	{
		sk->max_window = window_seq;
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(window_seq>>1, sk->mtu);
#else
		sk->mss = min(window_seq, sk->mtu);
#endif
	}
	window_seq += ack;

	/*
	 *	See if our window has been shrunk.
	 */
	if (after(sk->window_seq, window_seq)) {
		flag |= 4;
		tcp_window_shrunk(sk, window_seq);
	}

	/*
	 *	Update the right hand window edge of the host
	 */
	sk->window_seq = window_seq;

	/*
	 *	Pipe has emptied
	 */
	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out = 0;
	}

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
	    sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
		 * counter and increment it once every cwnd times. It's possible
		 * that this should be done only if sk->retransmits == 0. I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly. In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}
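
	/*
	 * Worked example (editor's addition): with ssthresh = 8, each ack
	 * of new data below the threshold does cong_window++, roughly
	 * doubling the window every round trip (slow start). Once
	 * cong_window reaches 8, cong_count must count up to cong_window
	 * before the window grows by one more segment, i.e. about one
	 * increment per full window of acks (congestion avoidance).
	 */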

	/*
	 *	Remember the highest ack received.
	 */

	sk->rcv_ack_seq = ack;

	/*
	 *	We passed data and got it acked, remove any soft error
	 *	log. Something worked...
	 */

	sk->err_soft = 0;

	/*
	 *	If this ack opens up a zero window, clear backoff. It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open ?
		 */

		if (skb_peek(&sk->write_queue) != NULL &&	/* should always be non-null */
		    !before(sk->window_seq, sk->write_queue.next->end_seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt. This eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < HZ/5)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = HZ/5;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	while (sk->send_head != NULL)
	{
		/* Check for a bug. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->end_seq, sk->send_head->link3->end_seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived at the other end.
		 */

		if (before(sk->send_head->end_seq, ack+1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 *	We were retransmitting. don't count this in RTT est
				 */
				flag |= 2;

				/*
				 *	even though we've gotten an ack, we're still
				 *	retransmitting as long as we're sending from
				 *	the retransmit queue. Keeping retransmits non-zero
				 *	prevents us from getting new data interspersed with
				 *	retransmissions.
				 */

				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 *	Note that we only reset backoff and rto in the
			 *	rtt recomputation code. And that doesn't happen
			 *	if there were retransmissions in effect. So the
			 *	first new packet after the retransmissions is
			 *	sent with the backoff still in effect. Not until
			 *	we get an ack from a non-retransmitted packet do
			 *	we reset the backoff and rto. This allows us to deal
			 *	with a situation where the network delay has increased
			 *	suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 *	We have one less packet out there.
			 */

			if (sk->packets_out > 0)
				sk->packets_out--;

			oskb = sk->send_head;

			if (!(flag&2))	/* Not retransmitting */
				tcp_rtt_estimator(sk,oskb);
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt'
					   in this case as we just set it up */
			cli();
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 *	We may need to remove this from the dev send list.
			 */

			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE);	/* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 * XXX someone ought to look at this too.. at the moment, if skb_peek()
	 * returns non-NULL, we completely ignore the timer stuff in the else
	 * clause. We ought to organize the code so that else clause can
	 * (should) be executed regardless, possibly moving the PROBE timer
	 * reset over. The skb_peek() thing should only move stuff to the
	 * write queue, NOT also manage the timer functions.
	 */

	/*
	 *	Maybe we can take some stuff off of the write queue,
	 *	and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after(sk->window_seq+1, sk->write_queue.next->end_seq) &&
		    (sk->retransmits == 0 ||
		     sk->ip_xmit_timeout != TIME_WRITE ||
		     before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
		    && sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
			 sk->send_head == NULL &&
			 sk->ack_backlog == 0 &&
			 sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room.
			 */
			tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 *	from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 *	from TCP_CLOSE we don't do anything
		 *
		 *	from anything else, if there is write data (or fin) pending,
		 *	we use a TIME_WRITE timeout, else if keepalive we reset to
		 *	a KEEPALIVE timeout, else we delete the timer.
		 *
		 *	We do not set flag for nominal write data, otherwise we may
		 *	force a state where we start to write itsy bitsy tidbits
		 *	of data.
		 */

		switch (sk->state) {
			case TCP_TIME_WAIT:
				/*
				 *	keep us in TIME_WAIT until we stop getting packets,
				 *	reset the timeout.
				 */
				tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
				break;
			case TCP_CLOSE:
				/*
				 *	don't touch the timer.
				 */
				break;
			default:
				/*
				 *	Must check send_head, write_queue, and ack_backlog
				 *	to determine which timeout to use.
				 */
				if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
					tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
				} else if (sk->keepopen) {
					tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
				} else {
					del_timer(&sk->retransmit_timer);
					sk->ip_xmit_timeout = 0;
				}
				break;
		}
	}

	/*
	 *	We have nothing queued but space to send. Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0 && sk->partial != NULL &&
	    skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 *	In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
	 *	we are now waiting for an acknowledge to our FIN. The other end is
	 *	already in TIME_WAIT.
	 *
	 *	Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->debug)
			printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
				sk->rcv_ack_seq, sk->write_seq, sk->acked_seq, sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			sk->shutdown = SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_CLOSE);
			return 1;
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 *	Move to FIN_WAIT2 to await a FIN from the other end. Set
	 *	SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 *	Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way handshake
	 */

	if (sk->state == TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest = th->source;
		sk->copied_seq = sk->acked_seq;
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->max_window == 0)
		{
			sk->max_window = 32;	/* Sanity check */
			sk->mss = min(sk->max_window, sk->mtu);
		}
	}

	/*
	 *	I make no guarantees about the first clause in the following
	 *	test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
	 *	what conditions "!flag" would be true. However I think the rest
	 *	of the conditions would prevent that from causing any
	 *	unnecessary retransmission.
	 *	Clearly if the first packet has expired it should be
	 *	retransmitted. The other alternative, "flag&2 && retransmits", is
	 *	harder to explain: You have to look carefully at how and when the
	 *	timer is set and with what timeout. The most recent transmission always
	 *	sets the timer. So in general if the most recent thing has timed
	 *	out, everything before it has as well. So we want to go ahead and
	 *	retransmit some more. If we didn't explicitly test for this
	 *	condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 *	would not be true. If you look at the pattern of timing, you can
	 *	show that rto is increased fast enough that the next packet would
	 *	almost never be retransmitted immediately. Then you'd end up
	 *	waiting for a timeout to send each packet on the retransmission
	 *	queue. With my implementation of the Karn sampling algorithm,
	 *	the timeout would double each time. The net result is that it would
	 *	take a hideous amount of time to recover from a single dropped packet.
	 *	It's possible that there should also be a test for TIME_WRITE, but
	 *	I think as long as "send_head != NULL" and "retransmit" is on, we've
	 *	got to be in real retransmission mode.
	 *	Note that tcp_do_retransmit is called with all==1. Setting cong_window
	 *	back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 *	As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	    (((flag&2) && sk->retransmits) ||
	     (sk->send_head->when + sk->rto < jiffies)))
	{
		if (sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return 1;

uninteresting_ack:
	if (sk->debug)
		printk("Ack ignored %u %u\n",ack,sk->sent_seq);

	/*
	 *	Keepalive processing.
	 */

	if (after(ack, sk->sent_seq))
	{
		return 0;
	}

	/*
	 *	Restart the keepalive timer.
	 */

	if (sk->keepopen)
	{
		if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
			tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
	}
	return 1;
}

/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 */

static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	sk->fin_seq = skb->end_seq;

	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch (sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 *	move to CLOSE_WAIT, tcp_data() already handled
			 *	sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 *	received a retransmission of the FIN, do
			 *	nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 *	received a retransmission of the FIN,
			 *	restart the TIME_WAIT timer.
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 *	This case occurs when a simultaneous close
			 *	happens, we must ack the received FIN and
			 *	enter the CLOSING state.
			 *
			 *	This causes a WRITE timeout, which will either
			 *	move on to TIME_WAIT when we timeout, or resend
			 *	the FIN properly (maybe we get rid of that annoying
			 *	FIN lost hang). The TIME_WRITE code is already correct
			 *	for handling this timeout.
			 */

			if (sk->ip_xmit_timeout != TIME_WRITE)
				tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 *	received a FIN -- send ACK and enter TIME_WAIT
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown |= SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 *	already in CLOSE
			 */
			break;
		default:
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}

/*
 *	Called for each packet when we find a new ACK endpoint sequence in it
 */
static inline u32 tcp_queue_ack(struct sk_buff * skb, struct sock * sk)
{
	/*
	 *	When we ack the fin, we do the FIN
	 *	processing.
	 */
	skb->acked = 1;
	if (skb->h.th->fin)
		tcp_fin(skb,sk,skb->h.th);
	return skb->end_seq;
}

/*
 *	Add a sk_buff to the TCP receive queue, calculating
 *	the ACK sequence as we go..
 */
static void tcp_queue(struct sk_buff * skb, struct sock * sk,
	 struct tcphdr *th, unsigned long saddr)
{
	struct sk_buff_head * list = &sk->receive_queue;
	struct sk_buff * next;
	u32 ack_seq;

	/*
	 *	Find where the new skb goes.. (This goes backwards,
	 *	on the assumption that we get the packets in order)
	 */
	next = list->prev;
	while (next != (struct sk_buff *) list) {
		if (!after(next->seq, skb->seq))
			break;
		next = next->prev;
	}
	/*
	 *	put it after the packet we found (which
	 *	may be the list-head, but that's fine).
	 */
	__skb_append(next, skb, list);
	next = skb->next;

	/*
	 *	Did we get anything new to ack?
	 */
	ack_seq = sk->acked_seq;
	if (!after(skb->seq, ack_seq) && after(skb->end_seq, ack_seq)) {
		ack_seq = tcp_queue_ack(skb, sk);

		/*
		 *	Do we have any old packets to ack that the above
		 *	made visible? (Go forward from skb)
		 */
		while (next != (struct sk_buff *) list) {
			if (after(next->seq, ack_seq))
				break;
			if (after(next->end_seq, ack_seq))
				ack_seq = tcp_queue_ack(next, sk);
			next = next->next;
		}

		/*
		 *	Ok, we found new data, update acked_seq as
		 *	necessary (and possibly send the actual
		 *	ACK packet).
		 *
		 *	rules for delaying an ack:
		 *	- delay time <= 0.5 HZ
		 *	- we don't have a window update to send
		 *	- must send at least every 2 full sized packets
		 */
		sk->acked_seq = ack_seq;
		if (!sk->delay_acks ||
		    /* sk->ack_backlog >= sk->max_ack_backlog || */
		    sk->bytes_rcv > sk->max_unacked || th->fin ||
		    sk->ato > HZ/2) {
			tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
		}
		else
		{
			sk->ack_backlog++;
			if (sk->debug)
				printk("Ack queued.\n");
			tcp_reset_xmit_timer(sk, TIME_WRITE, sk->ato);
		}
	}
}
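
/*
 * Example (editor's addition): say acked_seq is 1100 after receiving
 * [1000,1100), and [1200,1300) is already queued out of order. When the
 * missing segment [1100,1200) arrives, tcp_queue_ack() lifts ack_seq to
 * 1200, and the forward scan then absorbs the queued [1200,1300) as
 * well, leaving acked_seq at 1300 in a single pass.
 */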

/*
 *	This routine handles the data. If there is room in the buffer,
 *	it will have already been moved into it. If there is no
 *	room, then we will just have to discard the packet.
 */

static int tcp_data(struct sk_buff *skb, struct sock *sk,
	 unsigned long saddr, unsigned short len)
{
	struct tcphdr *th;
	u32 new_seq, shut_seq;

	th = skb->h.th;
	skb_pull(skb,th->doff*4);
	skb_trim(skb,len-(th->doff*4));

	/*
	 *	The bytes in the receive read/assembly queue have increased.
	 *	Needed for the low memory discard algorithm.
	 */

	sk->bytes_rcv += skb->len;

	if (skb->len == 0 && !th->fin)
	{
		/*
		 *	Don't want to keep passing ack's back and forth.
		 *	(someone sent us dataless, boring frame)
		 */
		if (!th->ack)
			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
		kfree_skb(skb, FREE_READ);
		return(0);
	}

	/*
	 *	We no longer have anyone receiving data on this connection.
	 */

#ifndef TCP_DONT_RST_SHUTDOWN

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		/*
		 *	FIXME: BSD has some magic to avoid sending resets to
		 *	broken 4.2 BSD keepalives. Much to my surprise a few non
		 *	BSD stacks still have broken keepalives so we want to
		 *	cope with it.
		 */

		if (skb->len)	/* We don't care if it's just an ack or
				   a keepalive/window probe */
		{
			new_seq = skb->seq + skb->len + th->syn;	/* Right edge of _data_ part of frame */

			/* Do this the way 4.4BSD treats it. Not what I'd
			   regard as the meaning of the spec but it's what BSD
			   does and clearly they know everything 8) */

			/*
			 *	This is valid because of two things
			 *
			 *	a) The way tcp_data behaves at the bottom.
			 *	b) A fin takes effect when read not when received.
			 */

			shut_seq = sk->acked_seq+1;	/* Last byte */

			if (after(new_seq,shut_seq))
			{
				if (sk->debug)
					printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
						sk, new_seq, shut_seq, sk->blog);
				if (sk->dead)
				{
					sk->acked_seq = new_seq + th->fin;
					tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
						sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
					tcp_statistics.TcpEstabResets++;
					sk->err = EPIPE;
					sk->error_report(sk);
					sk->shutdown = SHUTDOWN_MASK;
					tcp_set_state(sk,TCP_CLOSE);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
			}
		}
	}

#endif

	tcp_queue(skb, sk, th, saddr);

	/*
	 *	If we've missed a packet, send an ack.
	 *	Also start a timer to send another.
	 */

	if (!skb->acked)
	{
		/*
		 *	This is important. If we don't have much room left,
		 *	we need to throw out a few packets so we have a good
		 *	window. Note that mtu is used, not mss, because mss is really
		 *	for the send side. He could be sending us stuff as large as mtu.
		 */

		while (sock_rspace(sk) < sk->mtu)
		{
			struct sk_buff * skb1 = skb_peek(&sk->receive_queue);
			if (skb1 == NULL)
			{
				printk("INET: tcp.c:tcp_data memory leak detected.\n");
				break;
			}

			/*
			 *	Don't throw out something that has been acked.
			 */

			if (skb1->acked)
			{
				break;
			}

			skb_unlink(skb1);
			kfree_skb(skb1, FREE_READ);
		}
		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
		sk->ack_backlog++;
		tcp_reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, HZ/2));
	}

	/*
	 *	Now tell the user we may have some data.
	 */

	if (!sk->dead)
	{
		if (sk->debug)
			printk("Data wakeup.\n");
		sk->data_ready(sk,0);
	}
	return(0);
}

/*
 *	This routine is only called when we have urgent data
 *	signalled. It's the 'slow' part of tcp_urg. It could be
 *	moved inline now as tcp_urg is only called from one
 *	place. We handle URGent data wrong. We have to - as
 *	BSD still doesn't use the correction from RFC961.
 */

static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
{
	u32 ptr = ntohs(th->urg_ptr);

	if (ptr)
		ptr--;
	ptr += ntohl(th->seq);

	/* ignore urgent data that we've already seen and read */
	if (after(sk->copied_seq, ptr))
		return;

	/* do we already have a newer (or duplicate) urgent pointer? */
	if (sk->urg_data && !after(ptr, sk->urg_seq))
		return;

	/* tell the world about our new urgent pointer */
	if (sk->proc != 0) {
		if (sk->proc > 0) {
			kill_proc(sk->proc, SIGURG, 1);
		} else {
			kill_pg(-sk->proc, SIGURG, 1);
		}
	}
	sk->urg_data = URG_NOTYET;
	sk->urg_seq = ptr;
}
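
/*
 * Example (editor's addition): for a segment with seq = 1000 and
 * urg_ptr = 5 the code above computes urg_seq = 1000 + 5 - 1 = 1004,
 * i.e. it follows the BSD reading, where the urgent pointer points one
 * past the urgent byte, rather than at it as the RFC correction would
 * have it.
 */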

/*
 *	This is the 'fast' part of urgent handling.
 */

static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
{
	/*
	 *	Check if we get a new urgent pointer - normally not
	 */

	if (th->urg)
		tcp_check_urg(sk,th);

	/*
	 *	Do we wait for any urgent data? - normally not
	 */

	if (sk->urg_data == URG_NOTYET) {
		u32 ptr;

		/*
		 *	Is the urgent pointer pointing into this packet?
		 */
		ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
		if (ptr < len) {
			sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
			if (!sk->dead)
				sk->data_ready(sk,0);
		}
	}
}

/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok = 0;

	/*
	 *	"redo" is 1 if we have already seen this skb but couldn't
	 *	use it at that time (the socket was locked). In that case
	 *	we have already done a lot of the work (looked up the socket
	 *	etc).
	 */
	th = skb->h.th;
	sk = skb->sk;
	if (!redo) {
		tcp_statistics.TcpInSegs++;
		if (skb->pkt_type != PACKET_HOST)
			goto discard_it;

		/*
		 *	Pull up the IP header.
		 */

		skb_pull(skb, skb->h.raw-skb->data);

		/*
		 *	Try to use the device checksum if provided.
		 */
		switch (skb->ip_summed)
		{
			case CHECKSUM_NONE:
				skb->csum = csum_partial((char *)th, len, 0);
			case CHECKSUM_HW:
				if (tcp_check(th, len, saddr, daddr, skb->csum))
					goto discard_it;
			default:
				/* CHECKSUM_UNNECESSARY */
				break;
		}
		sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
		if (!sk)
			goto no_tcp_socket;
		skb->sk = sk;
		skb->seq = ntohl(th->seq);
		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
		skb->ack_seq = ntohl(th->ack_seq);

		skb->acked = 0;
		skb->used = 0;
		skb->free = 1;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/*
		 *	We may need to add it to the backlog here.
		 */
		if (sk->users)
		{
			__skb_queue_tail(&sk->back_log, skb);
			return(0);
		}
	}
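
	/*
	 * Editor's note: in the checksum switch above, CHECKSUM_NONE
	 * deliberately falls through into CHECKSUM_HW so that a freshly
	 * computed software checksum is verified by the same tcp_check()
	 * call. A segment parked on sk->back_log because the socket was
	 * busy re-enters this function later with redo == 1 and skips the
	 * demultiplexing work just done.
	 */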

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk->zapped || sk->state == TCP_CLOSE)
		goto no_tcp_socket;

	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}

	/*
	 *	Charge the memory to the socket.
	 */

	skb->sk = sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	We should now do header prediction.
	 */

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if (sk->state != TCP_ESTABLISHED)	/* Skip this lot for normal flow */
	{
		/*
		 *	Now deal with unusual cases.
		 */

		if (sk->state == TCP_LISTEN)
		{
			if (th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it
			 */

			if (th->rst || !th->syn || th->ack || ip_chk_addr(daddr) != IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 *
			 *	Now that TTCP is starting to be used we ought to queue this data.
			 */

			return 0;
		}

		/*
		 *	Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
		 *	then it's a new connection
		 */

		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected. The SYN_SENT case is unusual and should
		 *	not be in line code. [AC]
		 */

		if (sk->state == TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if (th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if (!tcp_ack(sk,th,skb->ack_seq,len))
				{
					/* Reset the ack - it's an ack from a
					   different connection  [ th->rst is checked in tcp_send_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return(0);
				}
				if (th->rst)
					return tcp_reset(sk,skb);
				if (!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok = 1;	/* Don't reset this connection for the syn */
				sk->acked_seq = skb->seq+1;
				sk->lastwin_seq = skb->seq+1;
				sk->fin_seq = skb->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest = th->source;
				sk->copied_seq = sk->acked_seq;
				if (!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if (sk->max_window == 0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if (th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if (sk->saddr == saddr && sk->daddr == daddr &&
					    sk->dummy_th.source == th->source &&
					    sk->dummy_th.dest == th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

		/*
		 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
		 *	a more complex suggestion for fixing these reuse issues in RFC1644
		 *	but not yet ready for general use. Also see RFC1379.
		 *
		 *	Note the funny way we go back to the top of this function for
		 *	this case ("goto try_next_socket"). That also takes care of
		 *	checking "sk->users" for the new socket as well as doing all
		 *	the normal tests on the packet.
		 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
		    after(skb->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if (sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err = ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			/* this is not really correct: we should check sk->users */
			if (sk && sk->state == TCP_LISTEN)
			{
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				tcp_conn_request(sk, skb, daddr, saddr, opt, dev, seq+128000);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */

	if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
	{
		bad_tcp_sequence(sk, th, len, opt, saddr, dev);
		kfree_skb(skb, FREE_READ);
		return 0;
	}

	if (th->rst)
		return tcp_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if (th->syn && !syn_ok)
	{
		tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_reset(sk,skb);
	}

	tcp_delack_estimator(sk);

	/*
	 *	Process the ACK
	 */

	if (th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if (sk->state == TCP_SYN_RECV)
		{
			tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		return 0;
	}

rfc_step6:	/* I'll clean this up later */

	/*
	 *	If the accepted buffer put us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */

	if (sk->rmem_alloc >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		return(0);
	}

	/*
	 *	Process urgent data
	 */

	tcp_urg(sk, th, len);

	/*
	 *	Process the encapsulated data
	 */

	if (tcp_data(skb,sk, saddr, len))
		kfree_skb(skb, FREE_READ);

	/*
	 *	And done
	 */

	return 0;

no_tcp_socket:
	/*
	 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
	 */
	tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);

discard_it:
	/*
	 *	Discard frame
	 */
	skb->sk = NULL;
	kfree_skb(skb, FREE_READ);
	return 0;
}