1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp_input.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * FIXES 23 * Pedro Roque : Double ACK bug 24 */ 25
26 #include <linux/config.h>
27 #include <net/tcp.h>
28
/*
 *	Policy code extracted so it's now separate
 */

33 /* 34 * Called each time to estimate the delayed ack timeout. This is 35 * how it should be done so a fast link isnt impacted by ack delay. 36 */ 37
38 extern__inline__voidtcp_delack_estimator(structsock *sk)
/* */ 39 { 40 /* 41 * Delayed ACK time estimator. 42 */ 43
44 if (sk->lrcvtime == 0)
45 { 46 sk->lrcvtime = jiffies;
47 sk->ato = HZ/3;
48 } 49 else 50 { 51 intm;
52
53 m = jiffies - sk->lrcvtime;
54
55 sk->lrcvtime = jiffies;
56
57 if (m <= 0)
58 m = 1;
59
60 if (m > (sk->rtt >> 3))
61 { 62 sk->ato = sk->rtt >> 3;
63 /* 64 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato); 65 */ 66 } 67 else 68 { 69 sk->ato = (sk->ato >> 1) + m;
70 /* 71 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato); 72 */ 73 } 74 } 75 } 76
77 /* 78 * Called on frames that were known _not_ to have been 79 * retransmitted [see Karn/Partridge Proceedings SIGCOMM 87]. 80 * The algorithm is from the SIGCOMM 88 piece by Van Jacobson. 81 */ 82
83 extern__inline__voidtcp_rtt_estimator(structsock *sk, structsk_buff *oskb)
/* */ 84 { 85 longm;
86 /* 87 * The following amusing code comes from Jacobson's 88 * article in SIGCOMM '88. Note that rtt and mdev 89 * are scaled versions of rtt and mean deviation. 90 * This is designed to be as fast as possible 91 * m stands for "measurement". 92 */ 93
94 m = jiffies - oskb->when; /* RTT */ 95 if(m<=0)
96 m=1; /* IS THIS RIGHT FOR <0 ??? */ 97 m -= (sk->rtt >> 3); /* m is now error in rtt est */ 98 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */ 99 if (m < 0)
100 m = -m; /* m is now abs(error) */ 101 m -= (sk->mdev >> 2); /* similar update on mdev */ 102 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ 103
104 /* 105 * Now update timeout. Note that this removes any backoff. 106 */ 107
108 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
109 if (sk->rto > 120*HZ)
110 sk->rto = 120*HZ;
111 if (sk->rto < HZ/5) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */ 112 sk->rto = HZ/5;
113 sk->backoff = 0;
114 } 115
/*
 *	Cached last hit socket.
 *
 *	A one-entry cache: the address/port four-tuple of the last
 *	successful lookup together with the socket it resolved to.
 *	get_tcp_sock() fills and consults it.
 */
static volatile unsigned long th_cache_saddr;
static volatile unsigned long th_cache_daddr;
static volatile unsigned short th_cache_dport;
static volatile unsigned short th_cache_sport;
static volatile struct sock *th_cache_sk;

/*
 *	Invalidate the cache.  Only the socket pointer is cleared; a NULL
 *	pointer alone is enough to make the next lookup miss.
 */
void tcp_cache_zap(void)
{
	th_cache_sk = NULL;
}

129 /* 130 * Find the socket, using the last hit cache if applicable. The cache is not quite 131 * right... 132 */ 133
134 staticinlinestructsock * get_tcp_sock(u32saddr, u16sport, u32daddr, u16dport)
/* */ 135 { 136 structsock * sk;
137
138 sk = (structsock *) th_cache_sk;
139 if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
140 sport != th_cache_sport || dport != th_cache_dport) { 141 sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
142 if (sk) { 143 th_cache_saddr=saddr;
144 th_cache_daddr=daddr;
145 th_cache_dport=dport;
146 th_cache_sport=sport;
147 th_cache_sk=sk;
148 } 149 } 150 returnsk;
151 } 152
153 /* 154 * React to a out-of-window TCP sequence number in an incoming packet 155 */ 156
157 staticvoidbad_tcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */ 158 structoptions *opt, unsignedlongsaddr, structdevice *dev)
159 { 160 if (th->rst)
161 return;
162
163 /* 164 * Send a reset if we get something not ours and we are 165 * unsynchronized. Note: We don't do anything to our end. We 166 * are just killing the bogus remote connection then we will 167 * connect again and it will work (with luck). 168 */ 169
170 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
171 { 172 tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
173 return;
174 } 175
176 /* 177 * 4.3reno machines look for these kind of acks so they can do fast 178 * recovery. Three identical 'old' acks lets it know that one frame has 179 * been lost and should be resent. Because this is before the whole window 180 * of data has timed out it can take one lost frame per window without 181 * stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2] 182 * 183 * We also should be spotting triple bad sequences. 184 */ 185 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
186 return;
187 } 188
189 /* 190 * This functions checks to see if the tcp header is actually acceptable. 191 */ 192
193 extern__inline__inttcp_sequence(structsock *sk, u32seq, u32end_seq)
/* */ 194 { 195 u32end_window = sk->acked_seq + sk->window;
196 return/* if start is at end of window, end must be too (zero window) */ 197 (seq == end_window && seq == end_seq) ||
198 /* if start is before end of window, check for interest */ 199 (before(seq, end_window) && !before(end_seq, sk->acked_seq));
200 } 201
/*
 *	When we get a reset we do this.  This probably is a tcp_output
 *	routine really.
 *
 *	Marks the socket zapped, records the error, moves the socket to
 *	TCP_CLOSE with both directions shut down, wakes anyone watching
 *	the state and frees the incoming buffer.  Always returns 0.
 */
static int tcp_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/*
	 *	We want the right error as BSD sees it (and indeed as we do).
	 */
	if (sk->state == TCP_SYN_SENT)
		sk->err = ECONNREFUSED;
	else if (sk->state == TCP_CLOSE_WAIT)
		sk->err = EPIPE;
	else
		sk->err = ECONNRESET;

#ifdef CONFIG_TCP_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 *
	 *	This is a good idea, but causes more sockets to take time
	 *	to close.
	 *
	 *	Ian Heavens has since shown this is an inadequate fix for
	 *	the protocol bug in question.
	 */
	if (sk->state != TCP_TIME_WAIT)
#endif
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}

	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	return 0;
}

242
243 /* 244 * Look for tcp options. Parses everything but only knows about MSS. 245 * This routine is always called with the packet containing the SYN. 246 * However it may also be called with the ack to the SYN. So you 247 * can't assume this is always the SYN. It's always called after 248 * we have set up sk->mtu to our own MTU. 249 * 250 * We need at minimum to add PAWS support here. Possibly large windows 251 * as Linux gets deployed on 100Mb/sec networks. 252 */ 253
254 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */ 255 { 256 unsignedchar *ptr;
257 intlength=(th->doff*4)-sizeof(structtcphdr);
258 intmss_seen = 0;
259
260 ptr = (unsignedchar *)(th + 1);
261
262 while(length>0)
263 { 264 intopcode=*ptr++;
265 intopsize=*ptr++;
266 switch(opcode)
267 { 268 caseTCPOPT_EOL:
269 return;
270 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ 271 length--;
272 ptr--; /* the opsize=*ptr++ above was a mistake */ 273 continue;
274
275 default:
276 if(opsize<=2) /* Avoid silly options looping forever */ 277 return;
278 switch(opcode)
279 { 280 caseTCPOPT_MSS:
281 if(opsize==4 && th->syn)
282 { 283 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
284 mss_seen = 1;
285 } 286 break;
287 /* Add other options here as people feel the urge to implement stuff like large windows */ 288 } 289 ptr+=opsize-2;
290 length-=opsize;
291 } 292 } 293 if (th->syn)
294 { 295 if (! mss_seen)
296 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */ 297 } 298 #ifdefCONFIG_INET_PCTCP 299 sk->mss = min(sk->max_window >> 1, sk->mtu);
300 #else 301 sk->mss = min(sk->max_window, sk->mtu);
302 sk->max_unacked = 2 * sk->mss;
303 #endif 304 } 305
306
307 /* 308 * This routine handles a connection request. 309 * It should make sure we haven't already responded. 310 * Because of the way BSD works, we have to send a syn/ack now. 311 * This also means it will be harder to close a socket which is 312 * listening. 313 */ 314
315 staticvoidtcp_conn_request(structsock *sk, structsk_buff *skb,
/* */ 316 u32daddr, u32saddr, structoptions *opt, structdevice *dev, u32seq)
317 { 318 structsock *newsk;
319 structtcphdr *th;
320 structrtable *rt;
321
322 th = skb->h.th;
323
324 /* If the socket is dead, don't accept the connection. */ 325 if (!sk->dead)
326 { 327 sk->data_ready(sk,0);
328 } 329 else 330 { 331 if(sk->debug)
332 printk("Reset on %p: Connect on dead socket.\n",sk);
333 tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
334 tcp_statistics.TcpAttemptFails++;
335 kfree_skb(skb, FREE_READ);
336 return;
337 } 338
339 /* 340 * Make sure we can accept more. This will prevent a 341 * flurry of syns from eating up all our memory. 342 * 343 * BSD does some funnies here and allows 3/2 times the 344 * set backlog as a fudge factor. Thats just too gross. 345 */ 346
347 if (sk->ack_backlog >= sk->max_ack_backlog)
348 { 349 tcp_statistics.TcpAttemptFails++;
350 kfree_skb(skb, FREE_READ);
351 return;
352 } 353
354 /* 355 * We need to build a new sock struct. 356 * It is sort of bad to have a socket without an inode attached 357 * to it, but the wake_up's will just wake up the listening socket, 358 * and if the listening socket is destroyed before this is taken 359 * off of the queue, this will take care of it. 360 */ 361
362 newsk = (structsock *) kmalloc(sizeof(structsock), GFP_ATOMIC);
363 if (newsk == NULL)
364 { 365 /* just ignore the syn. It will get retransmitted. */ 366 tcp_statistics.TcpAttemptFails++;
367 kfree_skb(skb, FREE_READ);
368 return;
369 } 370
371 memcpy(newsk, sk, sizeof(*newsk));
372 newsk->opt = NULL;
373 newsk->ip_route_cache = NULL;
374 if (opt && opt->optlen)
375 { 376 sk->opt = (structoptions*)kmalloc(sizeof(structoptions)+opt->optlen, GFP_ATOMIC);
377 if (!sk->opt)
378 { 379 kfree_s(newsk, sizeof(structsock));
380 tcp_statistics.TcpAttemptFails++;
381 kfree_skb(skb, FREE_READ);
382 return;
383 } 384 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb))
385 { 386 kfree_s(sk->opt, sizeof(structoptions)+opt->optlen);
387 kfree_s(newsk, sizeof(structsock));
388 tcp_statistics.TcpAttemptFails++;
389 kfree_skb(skb, FREE_READ);
390 return;
391 } 392 } 393 skb_queue_head_init(&newsk->write_queue);
394 skb_queue_head_init(&newsk->receive_queue);
395 newsk->send_head = NULL;
396 newsk->send_tail = NULL;
397 skb_queue_head_init(&newsk->back_log);
398 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/ 399 newsk->rto = TCP_TIMEOUT_INIT;
400 newsk->mdev = 0;
401 newsk->max_window = 0;
402 newsk->cong_window = 1;
403 newsk->cong_count = 0;
404 newsk->ssthresh = 0;
405 newsk->backoff = 0;
406 newsk->blog = 0;
407 newsk->intr = 0;
408 newsk->proc = 0;
409 newsk->done = 0;
410 newsk->partial = NULL;
411 newsk->pair = NULL;
412 newsk->wmem_alloc = 0;
413 newsk->rmem_alloc = 0;
414 newsk->localroute = sk->localroute;
415
416 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
417
418 newsk->err = 0;
419 newsk->shutdown = 0;
420 newsk->ack_backlog = 0;
421 newsk->acked_seq = skb->seq+1;
422 newsk->lastwin_seq = skb->seq+1;
423 newsk->delay_acks = 1;
424 newsk->copied_seq = skb->seq+1;
425 newsk->fin_seq = skb->seq;
426 newsk->state = TCP_SYN_RECV;
427 newsk->timeout = 0;
428 newsk->ip_xmit_timeout = 0;
429 newsk->write_seq = seq;
430 newsk->window_seq = newsk->write_seq;
431 newsk->rcv_ack_seq = newsk->write_seq;
432 newsk->urg_data = 0;
433 newsk->retransmits = 0;
434 newsk->linger=0;
435 newsk->destroy = 0;
436 init_timer(&newsk->timer);
437 newsk->timer.data = (unsignedlong)newsk;
438 newsk->timer.function = &net_timer;
439 init_timer(&newsk->retransmit_timer);
440 newsk->retransmit_timer.data = (unsignedlong)newsk;
441 newsk->retransmit_timer.function=&tcp_retransmit_timer;
442 newsk->dummy_th.source = skb->h.th->dest;
443 newsk->dummy_th.dest = skb->h.th->source;
444
445 /* 446 * Swap these two, they are from our point of view. 447 */ 448
449 newsk->daddr = saddr;
450 newsk->saddr = daddr;
451 newsk->rcv_saddr = daddr;
452
453 put_sock(newsk->num,newsk);
454 newsk->acked_seq = skb->seq + 1;
455 newsk->copied_seq = skb->seq + 1;
456 newsk->socket = NULL;
457
458 /* 459 * Grab the ttl and tos values and use them 460 */ 461
462 newsk->ip_ttl=sk->ip_ttl;
463 newsk->ip_tos=skb->ip_hdr->tos;
464
465 /* 466 * Use 512 or whatever user asked for 467 */ 468
469 /* 470 * Note use of sk->user_mss, since user has no direct access to newsk 471 */ 472
473 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
474 newsk->ip_route_cache = rt;
475
476 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
477 newsk->window_clamp = rt->rt_window;
478 else 479 newsk->window_clamp = 0;
480
481 if (sk->user_mss)
482 newsk->mtu = sk->user_mss;
483 elseif (rt)
484 newsk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
485 else 486 newsk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
487
488 /* 489 * But not bigger than device MTU 490 */ 491
492 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
493
494 #ifdefCONFIG_SKIP 495
496 /* 497 * SKIP devices set their MTU to 65535. This is so they can take packets 498 * unfragmented to security process then fragment. They could lie to the 499 * TCP layer about a suitable MTU, but its easier to let skip sort it out 500 * simply because the final package we want unfragmented is going to be 501 * 502 * [IPHDR][IPSP][Security data][Modified TCP data][Security data] 503 */ 504
505 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */ 506 sk->mtu=skip_pick_mtu(sk->mtu,dev);
507 #endif 508 /* 509 * This will min with what arrived in the packet 510 */ 511
512 tcp_options(newsk,skb->h.th);
513
514 tcp_cache_zap();
515 tcp_send_synack(newsk, sk, skb);
516 } 517
518
519 /* 520 * Handle a TCP window that shrunk on us. It shouldn't happen, 521 * but.. 522 * 523 * We may need to move packets from the send queue 524 * to the write queue, if the window has been shrunk on us. 525 * The RFC says you are not allowed to shrink your window 526 * like this, but if the other end does, you must be able 527 * to deal with it. 528 */ 529 voidtcp_window_shrunk(structsock * sk, u32window_seq)
/* */ 530 { 531 structsk_buff *skb;
532 structsk_buff *skb2;
533 structsk_buff *wskb = NULL;
534
535 skb2 = sk->send_head;
536 sk->send_head = NULL;
537 sk->send_tail = NULL;
538
539 /* 540 * This is an artifact of a flawed concept. We want one 541 * queue and a smarter send routine when we send all. 542 */ 543 cli();
544 while (skb2 != NULL)
545 { 546 skb = skb2;
547 skb2 = skb->link3;
548 skb->link3 = NULL;
549 if (after(skb->end_seq, window_seq))
550 { 551 if (sk->packets_out > 0)
552 sk->packets_out--;
553 /* We may need to remove this from the dev send list. */ 554 if (skb->next != NULL)
555 { 556 skb_unlink(skb);
557 } 558 /* Now add it to the write_queue. */ 559 if (wskb == NULL)
560 skb_queue_head(&sk->write_queue,skb);
561 else 562 skb_append(wskb,skb);
563 wskb = skb;
564 } 565 else 566 { 567 if (sk->send_head == NULL)
568 { 569 sk->send_head = skb;
570 sk->send_tail = skb;
571 } 572 else 573 { 574 sk->send_tail->link3 = skb;
575 sk->send_tail = skb;
576 } 577 skb->link3 = NULL;
578 } 579 } 580 sti();
581 } 582
583
584 /* 585 * This routine deals with incoming acks, but not outgoing ones. 586 * 587 * This routine is totally _WRONG_. The list structuring is wrong, 588 * the algorithm is wrong, the code is wrong. 589 */ 590
591 staticinttcp_ack(structsock *sk, structtcphdr *th, u32ack, intlen)
/* */ 592 { 593 intflag = 0;
594 u32window_seq;
595
596 /* 597 * 1 - there was data in packet as well as ack or new data is sent or 598 * in shutdown state 599 * 2 - data from retransmit queue was acked and removed 600 * 4 - window shrunk or data from retransmit queue was acked and removed 601 */ 602
603 if(sk->zapped)
604 return(1); /* Dead, cant ack any more so why bother */ 605
606 /* 607 * We have dropped back to keepalive timeouts. Thus we have 608 * no retransmits pending. 609 */ 610
611 if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
612 sk->retransmits = 0;
613
614 /* 615 * If the ack is newer than sent or older than previous acks 616 * then we can probably ignore it. 617 */ 618
619 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
620 gotouninteresting_ack;
621
622 /* 623 * If there is data set flag 1 624 */ 625
626 if (len != th->doff*4)
627 flag |= 1;
628
629 /* 630 * Have we discovered a larger window 631 */ 632 window_seq = ntohs(th->window);
633 if (window_seq > sk->max_window)
634 { 635 sk->max_window = window_seq;
636 #ifdefCONFIG_INET_PCTCP 637 /* Hack because we don't send partial packets to non SWS 638 handling hosts */ 639 sk->mss = min(window_seq>>1, sk->mtu);
640 #else 641 sk->mss = min(window_seq, sk->mtu);
642 #endif 643 } 644 window_seq += ack;
645
646 /* 647 * See if our window has been shrunk. 648 */ 649 if (after(sk->window_seq, window_seq)) { 650 flag |= 4;
651 tcp_window_shrunk(sk, window_seq);
652 } 653
654 /* 655 * Update the right hand window edge of the host 656 */ 657 sk->window_seq = window_seq;
658
659 /* 660 * Pipe has emptied 661 */ 662 if (sk->send_tail == NULL || sk->send_head == NULL)
663 { 664 sk->send_head = NULL;
665 sk->send_tail = NULL;
666 sk->packets_out= 0;
667 } 668
669 /* 670 * We don't want too many packets out there. 671 */ 672
673 if (sk->ip_xmit_timeout == TIME_WRITE &&
674 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
675 { 676
677 /* 678 * This is Jacobson's slow start and congestion avoidance. 679 * SIGCOMM '88, p. 328. Because we keep cong_window in integral 680 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a 681 * counter and increment it once every cwnd times. It's possible 682 * that this should be done only if sk->retransmits == 0. I'm 683 * interpreting "new data is acked" as including data that has 684 * been retransmitted but is just now being acked. 685 */ 686 if (sk->cong_window < sk->ssthresh)
687 /* 688 * In "safe" area, increase 689 */ 690 sk->cong_window++;
691 else 692 { 693 /* 694 * In dangerous area, increase slowly. In theory this is 695 * sk->cong_window += 1 / sk->cong_window 696 */ 697 if (sk->cong_count >= sk->cong_window)
698 { 699 sk->cong_window++;
700 sk->cong_count = 0;
701 } 702 else 703 sk->cong_count++;
704 } 705 } 706
707 /* 708 * Remember the highest ack received. 709 */ 710
711 sk->rcv_ack_seq = ack;
712
713 /* 714 * We passed data and got it acked, remove any soft error 715 * log. Something worked... 716 */ 717
718 sk->err_soft = 0;
719
720 /* 721 * If this ack opens up a zero window, clear backoff. It was 722 * being used to time the probes, and is probably far higher than 723 * it needs to be for normal retransmission. 724 */ 725
726 if (sk->ip_xmit_timeout == TIME_PROBE0)
727 { 728 sk->retransmits = 0; /* Our probe was answered */ 729
730 /* 731 * Was it a usable window open ? 732 */ 733
734 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */ 735 ! before (sk->window_seq, sk->write_queue.next->end_seq))
736 { 737 sk->backoff = 0;
738
739 /* 740 * Recompute rto from rtt. this eliminates any backoff. 741 */ 742
743 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
744 if (sk->rto > 120*HZ)
745 sk->rto = 120*HZ;
746 if (sk->rto < HZ/5) /* Was 1*HZ, then 1 - turns out we must allow about 747 .2 of a second because of BSD delayed acks - on a 100Mb/sec link 748 .2 of a second is going to need huge windows (SIGH) */ 749 sk->rto = HZ/5;
750 } 751 } 752
753 /* 754 * See if we can take anything off of the retransmit queue. 755 */ 756
757 while(sk->send_head != NULL)
758 { 759 /* Check for a bug. */ 760 if (sk->send_head->link3 &&
761 after(sk->send_head->end_seq, sk->send_head->link3->end_seq))
762 printk("INET: tcp.c: *** bug send_list out of order.\n");
763
764 /* 765 * If our packet is before the ack sequence we can 766 * discard it as it's confirmed to have arrived the other end. 767 */ 768
769 if (before(sk->send_head->end_seq, ack+1))
770 { 771 structsk_buff *oskb;
772 if (sk->retransmits)
773 { 774 /* 775 * We were retransmitting. don't count this in RTT est 776 */ 777 flag |= 2;
778
779 /* 780 * even though we've gotten an ack, we're still 781 * retransmitting as long as we're sending from 782 * the retransmit queue. Keeping retransmits non-zero 783 * prevents us from getting new data interspersed with 784 * retransmissions. 785 */ 786
787 if (sk->send_head->link3) /* Any more queued retransmits? */ 788 sk->retransmits = 1;
789 else 790 sk->retransmits = 0;
791 } 792 /* 793 * Note that we only reset backoff and rto in the 794 * rtt recomputation code. And that doesn't happen 795 * if there were retransmissions in effect. So the 796 * first new packet after the retransmissions is 797 * sent with the backoff still in effect. Not until 798 * we get an ack from a non-retransmitted packet do 799 * we reset the backoff and rto. This allows us to deal 800 * with a situation where the network delay has increased 801 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.) 802 */ 803
804 /* 805 * We have one less packet out there. 806 */ 807
808 if (sk->packets_out > 0)
809 sk->packets_out --;
810
811 oskb = sk->send_head;
812
813 if (!(flag&2)) /* Not retransmitting */ 814 tcp_rtt_estimator(sk,oskb);
815 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt 816 In this case as we just set it up */ 817 cli();
818 oskb = sk->send_head;
819 IS_SKB(oskb);
820 sk->send_head = oskb->link3;
821 if (sk->send_head == NULL)
822 { 823 sk->send_tail = NULL;
824 } 825
826 /* 827 * We may need to remove this from the dev send list. 828 */ 829
830 if (oskb->next)
831 skb_unlink(oskb);
832 sti();
833 kfree_skb(oskb, FREE_WRITE); /* write. */ 834 if (!sk->dead)
835 sk->write_space(sk);
836 } 837 else 838 { 839 break;
840 } 841 } 842
843 /* 844 * XXX someone ought to look at this too.. at the moment, if skb_peek() 845 * returns non-NULL, we complete ignore the timer stuff in the else 846 * clause. We ought to organize the code so that else clause can 847 * (should) be executed regardless, possibly moving the PROBE timer 848 * reset over. The skb_peek() thing should only move stuff to the 849 * write queue, NOT also manage the timer functions. 850 */ 851
852 /* 853 * Maybe we can take some stuff off of the write queue, 854 * and put it onto the xmit queue. 855 */ 856 if (skb_peek(&sk->write_queue) != NULL)
857 { 858 if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
859 (sk->retransmits == 0 ||
860 sk->ip_xmit_timeout != TIME_WRITE ||
861 before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
862 && sk->packets_out < sk->cong_window)
863 { 864 /* 865 * Add more data to the send queue. 866 */ 867 flag |= 1;
868 tcp_write_xmit(sk);
869 } 870 elseif (before(sk->window_seq, sk->write_queue.next->end_seq) &&
871 sk->send_head == NULL &&
872 sk->ack_backlog == 0 &&
873 sk->state != TCP_TIME_WAIT)
874 { 875 /* 876 * Data to queue but no room. 877 */ 878 tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
879 } 880 } 881 else 882 { 883 /* 884 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets 885 * from TCP_CLOSE we don't do anything 886 * 887 * from anything else, if there is write data (or fin) pending, 888 * we use a TIME_WRITE timeout, else if keepalive we reset to 889 * a KEEPALIVE timeout, else we delete the timer. 890 * 891 * We do not set flag for nominal write data, otherwise we may 892 * force a state where we start to write itsy bitsy tidbits 893 * of data. 894 */ 895
896 switch(sk->state) { 897 caseTCP_TIME_WAIT:
898 /* 899 * keep us in TIME_WAIT until we stop getting packets, 900 * reset the timeout. 901 */ 902 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
903 break;
904 caseTCP_CLOSE:
905 /* 906 * don't touch the timer. 907 */ 908 break;
909 default:
910 /* 911 * Must check send_head, write_queue, and ack_backlog 912 * to determine which timeout to use. 913 */ 914 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) { 915 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
916 }elseif (sk->keepopen) { 917 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
918 }else{ 919 del_timer(&sk->retransmit_timer);
920 sk->ip_xmit_timeout = 0;
921 } 922 break;
923 } 924 } 925
926 /* 927 * We have nothing queued but space to send. Send any partial 928 * packets immediately (end of Nagle rule application). 929 */ 930
931 if (sk->packets_out == 0 && sk->partial != NULL &&
932 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
933 { 934 flag |= 1;
935 tcp_send_partial(sk);
936 } 937
938 /* 939 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and 940 * we are now waiting for an acknowledge to our FIN. The other end is 941 * already in TIME_WAIT. 942 * 943 * Move to TCP_CLOSE on success. 944 */ 945
946 if (sk->state == TCP_LAST_ACK)
947 { 948 if (!sk->dead)
949 sk->state_change(sk);
950 if(sk->debug)
951 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
952 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
953 if (sk->rcv_ack_seq == sk->write_seq/*&& sk->acked_seq == sk->fin_seq*/)
954 { 955 flag |= 1;
956 sk->shutdown = SHUTDOWN_MASK;
957 tcp_set_state(sk,TCP_CLOSE);
958 return 1;
959 } 960 } 961
962 /* 963 * Incoming ACK to a FIN we sent in the case of our initiating the close. 964 * 965 * Move to FIN_WAIT2 to await a FIN from the other end. Set 966 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in. 967 */ 968
969 if (sk->state == TCP_FIN_WAIT1)
970 { 971
972 if (!sk->dead)
973 sk->state_change(sk);
974 if (sk->rcv_ack_seq == sk->write_seq)
975 { 976 flag |= 1;
977 sk->shutdown |= SEND_SHUTDOWN;
978 tcp_set_state(sk, TCP_FIN_WAIT2);
979 } 980 } 981
982 /* 983 * Incoming ACK to a FIN we sent in the case of a simultaneous close. 984 * 985 * Move to TIME_WAIT 986 */ 987
988 if (sk->state == TCP_CLOSING)
989 { 990
991 if (!sk->dead)
992 sk->state_change(sk);
993 if (sk->rcv_ack_seq == sk->write_seq)
994 { 995 flag |= 1;
996 tcp_time_wait(sk);
997 } 998 } 999
1000 /*1001 * Final ack of a three way shake 1002 */1003
1004 if(sk->state==TCP_SYN_RECV)
1005 {1006 tcp_set_state(sk, TCP_ESTABLISHED);
1007 tcp_options(sk,th);
1008 sk->dummy_th.dest=th->source;
1009 sk->copied_seq = sk->acked_seq;
1010 if(!sk->dead)
1011 sk->state_change(sk);
1012 if(sk->max_window==0)
1013 {1014 sk->max_window=32; /* Sanity check */1015 sk->mss=min(sk->max_window,sk->mtu);
1016 }1017 }1018
1019 /*1020 * I make no guarantees about the first clause in the following1021 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under1022 * what conditions "!flag" would be true. However I think the rest1023 * of the conditions would prevent that from causing any1024 * unnecessary retransmission. 1025 * Clearly if the first packet has expired it should be 1026 * retransmitted. The other alternative, "flag&2 && retransmits", is1027 * harder to explain: You have to look carefully at how and when the1028 * timer is set and with what timeout. The most recent transmission always1029 * sets the timer. So in general if the most recent thing has timed1030 * out, everything before it has as well. So we want to go ahead and1031 * retransmit some more. If we didn't explicitly test for this1032 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"1033 * would not be true. If you look at the pattern of timing, you can1034 * show that rto is increased fast enough that the next packet would1035 * almost never be retransmitted immediately. Then you'd end up1036 * waiting for a timeout to send each packet on the retransmission1037 * queue. With my implementation of the Karn sampling algorithm,1038 * the timeout would double each time. The net result is that it would1039 * take a hideous amount of time to recover from a single dropped packet.1040 * It's possible that there should also be a test for TIME_WRITE, but1041 * I think as long as "send_head != NULL" and "retransmit" is on, we've1042 * got to be in real retransmission mode.1043 * Note that tcp_do_retransmit is called with all==1. Setting cong_window1044 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.1045 * As long as no further losses occur, this seems reasonable.1046 */1047
1048 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
1049 (((flag&2) && sk->retransmits) ||
1050 (sk->send_head->when + sk->rto < jiffies)))
1051 {1052 if(sk->send_head->when + sk->rto < jiffies)
1053 tcp_retransmit(sk,0);
1054 else1055 {1056 tcp_do_retransmit(sk, 1);
1057 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1058 }1059 }1060
1061 return 1;
1062
1063 uninteresting_ack:
1064 if(sk->debug)
1065 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
1066
1067 /*1068 * Keepalive processing.1069 */1070
1071 if (after(ack, sk->sent_seq))
1072 {1073 return 0;
1074 }1075
1076 /*1077 * Restart the keepalive timer.1078 */1079
1080 if (sk->keepopen)
1081 {1082 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
1083 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1084 }1085 return 1;
1086 }1087
1088
1089 /*1090 * Process the FIN bit. This now behaves as it is supposed to work1091 * and the FIN takes effect when it is validly part of sequence1092 * space. Not before when we get holes.1093 *1094 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT1095 * (and thence onto LAST-ACK and finally, CLOSE, we never enter1096 * TIME-WAIT)1097 *1098 * If we are in FINWAIT-1, a received FIN indicates simultaneous1099 * close and we go into CLOSING (and later onto TIME-WAIT)1100 *1101 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.1102 *1103 */1104
1105 staticinttcp_fin(structsk_buff *skb, structsock *sk, structtcphdr *th)
/* */1106 {1107 sk->fin_seq = skb->end_seq;
1108
1109 if (!sk->dead)
1110 {1111 sk->state_change(sk);
1112 sock_wake_async(sk->socket, 1);
1113 }1114
1115 switch(sk->state)
1116 {1117 caseTCP_SYN_RECV:
1118 caseTCP_SYN_SENT:
1119 caseTCP_ESTABLISHED:
1120 /*1121 * move to CLOSE_WAIT, tcp_data() already handled1122 * sending the ack.1123 */1124 tcp_set_state(sk,TCP_CLOSE_WAIT);
1125 if (th->rst)
1126 sk->shutdown = SHUTDOWN_MASK;
1127 break;
1128
1129 caseTCP_CLOSE_WAIT:
1130 caseTCP_CLOSING:
1131 /*1132 * received a retransmission of the FIN, do1133 * nothing.1134 */1135 break;
1136 caseTCP_TIME_WAIT:
1137 /*1138 * received a retransmission of the FIN,1139 * restart the TIME_WAIT timer.1140 */1141 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1142 return(0);
1143 caseTCP_FIN_WAIT1:
1144 /*1145 * This case occurs when a simultaneous close1146 * happens, we must ack the received FIN and1147 * enter the CLOSING state.1148 *1149 * This causes a WRITE timeout, which will either1150 * move on to TIME_WAIT when we timeout, or resend1151 * the FIN properly (maybe we get rid of that annoying1152 * FIN lost hang). The TIME_WRITE code is already correct1153 * for handling this timeout.1154 */1155
1156 if(sk->ip_xmit_timeout != TIME_WRITE)
1157 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1158 tcp_set_state(sk,TCP_CLOSING);
1159 break;
1160 caseTCP_FIN_WAIT2:
1161 /*1162 * received a FIN -- send ACK and enter TIME_WAIT1163 */1164 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1165 sk->shutdown|=SHUTDOWN_MASK;
1166 tcp_set_state(sk,TCP_TIME_WAIT);
1167 break;
1168 caseTCP_CLOSE:
1169 /*1170 * already in CLOSE1171 */1172 break;
1173 default:
1174 tcp_set_state(sk,TCP_LAST_ACK);
1175
1176 /* Start the timers. */1177 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1178 return(0);
1179 }1180
1181 return(0);
1182 }1183
1184
1185
1186 /*1187 * This routine handles the data. If there is room in the buffer,1188 * it will be have already been moved into it. If there is no1189 * room, then we will just have to discard the packet.1190 */1191
1192 staticinttcp_data(structsk_buff *skb, structsock *sk,
/* */1193 unsignedlongsaddr, unsignedshortlen)
1194 {1195 structsk_buff *skb1, *skb2;
1196 structtcphdr *th;
1197 intdup_dumped=0;
1198 u32new_seq, shut_seq;
1199
1200 th = skb->h.th;
1201 skb_pull(skb,th->doff*4);
1202 skb_trim(skb,len-(th->doff*4));
1203
1204 /*1205 * The bytes in the receive read/assembly queue has increased. Needed for the1206 * low memory discard algorithm 1207 */1208
1209 sk->bytes_rcv += skb->len;
1210
1211 if (skb->len == 0 && !th->fin)
1212 {1213 /* 1214 * Don't want to keep passing ack's back and forth. 1215 * (someone sent us dataless, boring frame)1216 */1217 if (!th->ack)
1218 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
1219 kfree_skb(skb, FREE_READ);
1220 return(0);
1221 }1222
1223 /*1224 * We no longer have anyone receiving data on this connection.1225 */1226
1227 #ifndef TCP_DONT_RST_SHUTDOWN
1228
1229 if(sk->shutdown & RCV_SHUTDOWN)
1230 {1231 /*1232 * FIXME: BSD has some magic to avoid sending resets to1233 * broken 4.2 BSD keepalives. Much to my surprise a few non1234 * BSD stacks still have broken keepalives so we want to1235 * cope with it.1236 */1237
1238 if(skb->len) /* We don't care if it's just an ack or1239 a keepalive/window probe */1240 {1241 new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */1242
1243 /* Do this the way 4.4BSD treats it. Not what I'd1244 regard as the meaning of the spec but it's what BSD1245 does and clearly they know everything 8) */1246
1247 /*1248 * This is valid because of two things1249 *1250 * a) The way tcp_data behaves at the bottom.1251 * b) A fin takes effect when read not when received.1252 */1253
1254 shut_seq = sk->acked_seq+1; /* Last byte */1255
1256 if(after(new_seq,shut_seq))
1257 {1258 if(sk->debug)
1259 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
1260 sk, new_seq, shut_seq, sk->blog);
1261 if(sk->dead)
1262 {1263 sk->acked_seq = new_seq + th->fin;
1264 tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
1265 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
1266 tcp_statistics.TcpEstabResets++;
1267 sk->err = EPIPE;
1268 sk->error_report(sk);
1269 sk->shutdown = SHUTDOWN_MASK;
1270 tcp_set_state(sk,TCP_CLOSE);
1271 kfree_skb(skb, FREE_READ);
1272 return 0;
1273 }1274 }1275 }1276 }1277
1278 #endif1279
1280 /*1281 * Now we have to walk the chain, and figure out where this one1282 * goes into it. This is set up so that the last packet we received1283 * will be the first one we look at, that way if everything comes1284 * in order, there will be no performance loss, and if they come1285 * out of order we will be able to fit things in nicely.1286 *1287 * [AC: This is wrong. We should assume in order first and then walk1288 * forwards from the first hole based upon real traffic patterns.]1289 * 1290 */1291
1292 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */1293 {1294 skb_queue_head(&sk->receive_queue,skb);
1295 skb1= NULL;
1296 }1297 else1298 {1299 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
1300 {1301 if(sk->debug)
1302 {1303 printk("skb1=%p :", skb1);
1304 printk("skb1->seq = %d: ", skb1->seq);
1305 printk("skb->seq = %d\n",skb->seq);
1306 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
1307 sk->acked_seq);
1308 }1309
1310 /*1311 * Optimisation: Duplicate frame or extension of previous frame from1312 * same sequence point (lost ack case).1313 * The frame contains duplicate data or replaces a previous frame1314 * discard the previous frame (safe as sk->users is set) and put1315 * the new one in its place.1316 */1317
1318 if (skb->seq==skb1->seq && skb->len>=skb1->len)
1319 {1320 skb_append(skb1,skb);
1321 skb_unlink(skb1);
1322 kfree_skb(skb1,FREE_READ);
1323 dup_dumped=1;
1324 skb1=NULL;
1325 break;
1326 }1327
1328 /*1329 * Found where it fits1330 */1331
1332 if (after(skb->seq+1, skb1->seq))
1333 {1334 skb_append(skb1,skb);
1335 break;
1336 }1337
1338 /*1339 * See if we've hit the start. If so insert.1340 */1341 if (skb1 == skb_peek(&sk->receive_queue))
1342 {1343 skb_queue_head(&sk->receive_queue, skb);
1344 break;
1345 }1346 }1347 }1348
1349 /*1350 * Figure out what the ack value for this frame is1351 */1352
1353 if (before(sk->acked_seq, sk->copied_seq))
1354 {1355 printk("*** tcp.c:tcp_data bug acked < copied\n");
1356 sk->acked_seq = sk->copied_seq;
1357 }1358
1359 /*1360 * Now figure out if we can ack anything. This is very messy because we really want two1361 * receive queues, a completed and an assembly queue. We also want only one transmit1362 * queue.1363 */1364
1365 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1))
1366 {1367 if (before(skb->seq, sk->acked_seq+1))
1368 {1369
1370 if (after(skb->end_seq, sk->acked_seq))
1371 sk->acked_seq = skb->end_seq;
1372
1373 skb->acked = 1;
1374
1375 /*1376 * When we ack the fin, we do the FIN 1377 * processing.1378 */1379
1380 if (skb->h.th->fin)
1381 {1382 tcp_fin(skb,sk,skb->h.th);
1383 }1384
1385 for(skb2 = skb->next;
1386 skb2 != (structsk_buff *)&sk->receive_queue;
1387 skb2 = skb2->next)
1388 {1389 if (before(skb2->seq, sk->acked_seq+1))
1390 {1391 if (after(skb2->end_seq, sk->acked_seq))
1392 sk->acked_seq = skb2->end_seq;
1393
1394 skb2->acked = 1;
1395 /*1396 * When we ack the fin, we do1397 * the fin handling.1398 */1399 if (skb2->h.th->fin)
1400 {1401 tcp_fin(skb,sk,skb->h.th);
1402 }1403
1404 /*1405 * Force an immediate ack.1406 */1407
1408 sk->ack_backlog = sk->max_ack_backlog;
1409 }1410 else1411 {1412 break;
1413 }1414 }1415
1416 /*1417 * This also takes care of updating the window.1418 * This if statement needs to be simplified.1419 *1420 * rules for delaying an ack:1421 * - delay time <= 0.5 HZ1422 * - we don't have a window update to send1423 * - must send at least every 2 full sized packets1424 */1425 if (!sk->delay_acks ||
1426 /* sk->ack_backlog >= sk->max_ack_backlog || */1427 sk->bytes_rcv > sk->max_unacked || th->fin ||
1428 sk->ato > HZ/2 ||
1429 tcp_raise_window(sk)) {1430 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr);
1431 }1432 else1433 {1434 sk->ack_backlog++;
1435
1436 if(sk->debug)
1437 printk("Ack queued.\n");
1438
1439 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->ato);
1440
1441 }1442 }1443 }1444
1445 /*1446 * If we've missed a packet, send an ack.1447 * Also start a timer to send another.1448 */1449
1450 if (!skb->acked)
1451 {1452
1453 /*1454 * This is important. If we don't have much room left,1455 * we need to throw out a few packets so we have a good1456 * window. Note that mtu is used, not mss, because mss is really1457 * for the send side. He could be sending us stuff as large as mtu.1458 */1459
1460 while (sock_rspace(sk) < sk->mtu)
1461 {1462 skb1 = skb_peek(&sk->receive_queue);
1463 if (skb1 == NULL)
1464 {1465 printk("INET: tcp.c:tcp_data memory leak detected.\n");
1466 break;
1467 }1468
1469 /*1470 * Don't throw out something that has been acked. 1471 */1472
1473 if (skb1->acked)
1474 {1475 break;
1476 }1477
1478 skb_unlink(skb1);
1479 kfree_skb(skb1, FREE_READ);
1480 }1481 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
1482 sk->ack_backlog++;
1483 tcp_reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, HZ/2));
1484 }1485
1486 /*1487 * Now tell the user we may have some data. 1488 */1489
1490 if (!sk->dead)
1491 {1492 if(sk->debug)
1493 printk("Data wakeup.\n");
1494 sk->data_ready(sk,0);
1495 }1496 return(0);
1497 }1498
1499
1500 /*1501 * This routine is only called when we have urgent data1502 * signalled. Its the 'slow' part of tcp_urg. It could be1503 * moved inline now as tcp_urg is only called from one1504 * place. We handle URGent data wrong. We have to - as1505 * BSD still doesn't use the correction from RFC961.1506 */1507
1508 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */1509 {1510 u32ptr = ntohs(th->urg_ptr);
1511
1512 if (ptr)
1513 ptr--;
1514 ptr += ntohl(th->seq);
1515
1516 /* ignore urgent data that we've already seen and read */1517 if (after(sk->copied_seq, ptr))
1518 return;
1519
1520 /* do we already have a newer (or duplicate) urgent pointer? */1521 if (sk->urg_data && !after(ptr, sk->urg_seq))
1522 return;
1523
1524 /* tell the world about our new urgent pointer */1525 if (sk->proc != 0) {1526 if (sk->proc > 0) {1527 kill_proc(sk->proc, SIGURG, 1);
1528 }else{1529 kill_pg(-sk->proc, SIGURG, 1);
1530 }1531 }1532 sk->urg_data = URG_NOTYET;
1533 sk->urg_seq = ptr;
1534 }1535
1536 /*1537 * This is the 'fast' part of urgent handling.1538 */1539
1540 staticinlinevoidtcp_urg(structsock *sk, structtcphdr *th, unsignedlonglen)
/* */1541 {1542 /*1543 * Check if we get a new urgent pointer - normally not 1544 */1545
1546 if (th->urg)
1547 tcp_check_urg(sk,th);
1548
1549 /*1550 * Do we wait for any urgent data? - normally not1551 */1552
1553 if (sk->urg_data == URG_NOTYET) {1554 u32ptr;
1555
1556 /*1557 * Is the urgent pointer pointing into this packet? 1558 */1559 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
1560 if (ptr < len) {1561 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
1562 if (!sk->dead)
1563 sk->data_ready(sk,0);
1564 }1565 }1566 }1567
1568
1569 /*1570 * A TCP packet has arrived.1571 * skb->h.raw is the TCP header.1572 */1573
1574 inttcp_rcv(structsk_buff *skb, structdevice *dev, structoptions *opt,
/* */1575 __u32daddr, unsignedshortlen,
1576 __u32saddr, intredo, structinet_protocol * protocol)
1577 {1578 structtcphdr *th;
1579 structsock *sk;
1580 intsyn_ok=0;
1581
1582 /*1583 * "redo" is 1 if we have already seen this skb but couldn't1584 * use it at that time (the socket was locked). In that case1585 * we have already done a lot of the work (looked up the socket1586 * etc).1587 */1588 th = skb->h.th;
1589 sk = skb->sk;
1590 if (!redo) {1591 tcp_statistics.TcpInSegs++;
1592 if (skb->pkt_type!=PACKET_HOST)
1593 gotodiscard_it;
1594
1595 /*1596 * Pull up the IP header.1597 */1598
1599 skb_pull(skb, skb->h.raw-skb->data);
1600
1601 /*1602 * Try to use the device checksum if provided.1603 */1604 switch (skb->ip_summed)
1605 {1606 caseCHECKSUM_NONE:
1607 skb->csum = csum_partial((char *)th, len, 0);
1608 caseCHECKSUM_HW:
1609 if (tcp_check(th, len, saddr, daddr, skb->csum))
1610 gotodiscard_it;
1611 default:
1612 /* CHECKSUM_UNNECESSARY */1613 }1614 sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
1615 if (!sk)
1616 gotono_tcp_socket;
1617 skb->sk = sk;
1618 skb->seq = ntohl(th->seq);
1619 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
1620 skb->ack_seq = ntohl(th->ack_seq);
1621
1622 skb->acked = 0;
1623 skb->used = 0;
1624 skb->free = 1;
1625 skb->saddr = daddr;
1626 skb->daddr = saddr;
1627
1628 /* We may need to add it to the backlog here. */1629 if (sk->users)
1630 {1631 skb_queue_tail(&sk->back_log, skb);
1632 return(0);
1633 }1634 }1635
1636 /*1637 * If this socket has got a reset it's to all intents and purposes 1638 * really dead. Count closed sockets as dead.1639 *1640 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD1641 * simply drops data. This seems incorrect as a 'closed' TCP doesn't1642 * exist so should cause resets as if the port was unreachable.1643 */1644
1645 if (sk->zapped || sk->state==TCP_CLOSE)
1646 gotono_tcp_socket;
1647
1648 if (!sk->prot)
1649 {1650 printk("IMPOSSIBLE 3\n");
1651 return(0);
1652 }1653
1654
1655 /*1656 * Charge the memory to the socket. 1657 */1658
1659 skb->sk=sk;
1660 sk->rmem_alloc += skb->truesize;
1661
1662 /*1663 * We should now do header prediction.1664 */1665
1666 /*1667 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We1668 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug1669 * compatibility. We also set up variables more thoroughly [Karn notes in the1670 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].1671 */1672
1673 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */1674 {1675
1676 /*1677 * Now deal with unusual cases.1678 */1679
1680 if(sk->state==TCP_LISTEN)
1681 {1682 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */1683 tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
1684
1685 /*1686 * We don't care for RST, and non SYN are absorbed (old segments)1687 * Broadcast/multicast SYN isn't allowed. Note - bug if you change the1688 * netmask on a running connection it can go broadcast. Even Sun's have1689 * this problem so I'm ignoring it 1690 */1691
1692 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
1693 {1694 kfree_skb(skb, FREE_READ);
1695 return 0;
1696 }1697
1698 /* 1699 * Guess we need to make a new socket up 1700 */1701
1702 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
1703
1704 /*1705 * Now we have several options: In theory there is nothing else1706 * in the frame. KA9Q has an option to send data with the syn,1707 * BSD accepts data with the syn up to the [to be] advertised window1708 * and Solaris 2.1 gives you a protocol error. For now we just ignore1709 * it, that fits the spec precisely and avoids incompatibilities. It1710 * would be nice in future to drop through and process the data.1711 *1712 * Now TTCP is starting to use we ought to queue this data.1713 */1714
1715 return 0;
1716 }1717
1718 /* 1719 * Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN1720 * then its a new connection1721 */1722
1723 if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
1724 {1725 kfree_skb(skb, FREE_READ);
1726 return 0;
1727 }1728
1729 /*1730 * SYN sent means we have to look for a suitable ack and either reset1731 * for bad matches or go to connected. The SYN_SENT case is unusual and should1732 * not be in line code. [AC]1733 */1734
1735 if(sk->state==TCP_SYN_SENT)
1736 {1737 /* Crossed SYN or previous junk segment */1738 if(th->ack)
1739 {1740 /* We got an ack, but it's not a good ack */1741 if(!tcp_ack(sk,th,skb->ack_seq,len))
1742 {1743 /* Reset the ack - its an ack from a 1744 different connection [ th->rst is checked in tcp_send_reset()] */1745 tcp_statistics.TcpAttemptFails++;
1746 tcp_send_reset(daddr, saddr, th,
1747 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1748 kfree_skb(skb, FREE_READ);
1749 return(0);
1750 }1751 if(th->rst)
1752 returntcp_reset(sk,skb);
1753 if(!th->syn)
1754 {1755 /* A valid ack from a different connection1756 start. Shouldn't happen but cover it */1757 tcp_statistics.TcpAttemptFails++;
1758 tcp_send_reset(daddr, saddr, th,
1759 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1760 kfree_skb(skb, FREE_READ);
1761 return 0;
1762 }1763 /*1764 * Ok.. it's good. Set up sequence numbers and1765 * move to established.1766 */1767 syn_ok=1; /* Don't reset this connection for the syn */1768 sk->acked_seq = skb->seq+1;
1769 sk->lastwin_seq = skb->seq+1;
1770 sk->fin_seq = skb->seq;
1771 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
1772 tcp_set_state(sk, TCP_ESTABLISHED);
1773 tcp_options(sk,th);
1774 sk->dummy_th.dest=th->source;
1775 sk->copied_seq = sk->acked_seq;
1776 if(!sk->dead)
1777 {1778 sk->state_change(sk);
1779 sock_wake_async(sk->socket, 0);
1780 }1781 if(sk->max_window==0)
1782 {1783 sk->max_window = 32;
1784 sk->mss = min(sk->max_window, sk->mtu);
1785 }1786 }1787 else1788 {1789 /* See if SYN's cross. Drop if boring */1790 if(th->syn && !th->rst)
1791 {1792 /* Crossed SYN's are fine - but talking to1793 yourself is right out... */1794 if(sk->saddr==saddr && sk->daddr==daddr &&
1795 sk->dummy_th.source==th->source &&
1796 sk->dummy_th.dest==th->dest)
1797 {1798 tcp_statistics.TcpAttemptFails++;
1799 returntcp_reset(sk,skb);
1800 }1801 tcp_set_state(sk,TCP_SYN_RECV);
1802
1803 /*1804 * FIXME:1805 * Must send SYN|ACK here1806 */1807 }1808 /* Discard junk segment */1809 kfree_skb(skb, FREE_READ);
1810 return 0;
1811 }1812 /*1813 * SYN_RECV with data maybe.. drop through1814 */1815 gotorfc_step6;
1816 }1817
1818 /*1819 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is1820 * a more complex suggestion for fixing these reuse issues in RFC16441821 * but not yet ready for general use. Also see RFC1379.1822 *1823 * Note the funny way we go back to the top of this function for1824 * this case ("goto try_next_socket"). That also takes care of1825 * checking "sk->users" for the new socket as well as doing all1826 * the normal tests on the packet.1827 */1828
1829 #defineBSD_TIME_WAIT1830 #ifdefBSD_TIME_WAIT1831 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
1832 after(skb->seq, sk->acked_seq) && !th->rst)
1833 {1834 u32seq = sk->write_seq;
1835 if(sk->debug)
1836 printk("Doing a BSD time wait\n");
1837 tcp_statistics.TcpEstabResets++;
1838 sk->rmem_alloc -= skb->truesize;
1839 skb->sk = NULL;
1840 sk->err=ECONNRESET;
1841 tcp_set_state(sk, TCP_CLOSE);
1842 sk->shutdown = SHUTDOWN_MASK;
1843 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
1844 /* this is not really correct: we should check sk->users */1845 if (sk && sk->state==TCP_LISTEN)
1846 {1847 skb->sk = sk;
1848 sk->rmem_alloc += skb->truesize;
1849 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
1850 return 0;
1851 }1852 kfree_skb(skb, FREE_READ);
1853 return 0;
1854 }1855 #endif1856 }1857
1858 /*1859 * We are now in normal data flow (see the step list in the RFC)1860 * Note most of these are inline now. I'll inline the lot when1861 * I have time to test it hard and look at what gcc outputs 1862 */1863
1864 if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
1865 {1866 bad_tcp_sequence(sk, th, len, opt, saddr, dev);
1867 kfree_skb(skb, FREE_READ);
1868 return 0;
1869 }1870
1871 if(th->rst)
1872 returntcp_reset(sk,skb);
1873
1874 /*1875 * !syn_ok is effectively the state test in RFC793.1876 */1877
1878 if(th->syn && !syn_ok)
1879 {1880 tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
1881 returntcp_reset(sk,skb);
1882 }1883
1884 tcp_delack_estimator(sk);
1885
1886 /*1887 * Process the ACK1888 */1889
1890
1891 if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
1892 {1893 /*1894 * Our three way handshake failed.1895 */1896
1897 if(sk->state==TCP_SYN_RECV)
1898 {1899 tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
1900 }1901 kfree_skb(skb, FREE_READ);
1902 return 0;
1903 }1904
1905 rfc_step6: /* I'll clean this up later */1906
1907 /*1908 * If the accepted buffer put us over our queue size we1909 * now drop it (we must process the ack first to avoid1910 * deadlock cases).1911 */1912
1913 if (sk->rmem_alloc >= sk->rcvbuf)
1914 {1915 kfree_skb(skb, FREE_READ);
1916 return(0);
1917 }1918
1919
1920 /*1921 * Process urgent data1922 */1923
1924 tcp_urg(sk, th, len);
1925
1926 /*1927 * Process the encapsulated data1928 */1929
1930 if(tcp_data(skb,sk, saddr, len))
1931 kfree_skb(skb, FREE_READ);
1932
1933 /*1934 * And done1935 */1936
1937 return 0;
1938
1939 no_tcp_socket:
1940 /*1941 * No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)1942 */1943 tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
1944
1945 discard_it:
1946 /*1947 * Discard frame1948 */1949 skb->sk = NULL;
1950 kfree_skb(skb, FREE_READ);
1951 return 0;
1952 }