/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	@(#)tcp_input.c	1.0.16	05/25/93
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * FIXES
 *		Pedro Roque	:	Double ACK bug
 */
#include <linux/config.h>
#include <net/tcp.h>

/*
 *	Policy code extracted so it's now separate
 */

/*
 *	Called each time to estimate the delayed ack timeout. This is
 *	how it should be done so a fast link isn't impacted by ack delay.
 */

extern __inline__ void tcp_delack_estimator(struct sock *sk)
{
	/*
	 *	Delayed ACK time estimator.
	 */

	if (sk->lrcvtime == 0)
	{
		sk->lrcvtime = jiffies;
		sk->ato = HZ/3;
	}
	else
	{
		int m;

		m = jiffies - sk->lrcvtime;

		sk->lrcvtime = jiffies;

		if (m <= 0)
			m = 1;

		if (m > (sk->rtt >> 3))
		{
			sk->ato = sk->rtt >> 3;
			/*
			 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
			 */
		}
		else
		{
			sk->ato = (sk->ato >> 1) + m;
			/*
			 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
			 */
		}
	}
}
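
/*
 * A rough worked example of the estimator above (an illustration, not
 * from the original source): with HZ == 100 and a smoothed rtt of 64
 * ticks (sk->rtt is kept scaled by 8, so sk->rtt == 512), segments
 * arriving 10 ticks apart give sk->ato = ato/2 + 10, which settles
 * near 20 ticks; a quiet spell of 100 ticks exceeds rtt/8 == 64 and
 * clamps sk->ato to 64 instead.
 */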

/*
 *	Called on frames that were known _not_ to have been
 *	retransmitted [see Karn/Partridge Proceedings SIGCOMM 87].
 *	The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
 */

extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
{
	long m;
	/*
	 *	The following amusing code comes from Jacobson's
	 *	article in SIGCOMM '88.  Note that rtt and mdev
	 *	are scaled versions of rtt and mean deviation.
	 *	This is designed to be as fast as possible
	 *	m stands for "measurement".
	 */

	m = jiffies - oskb->when;  /* RTT */
	if (sk->rtt != 0) {
		if (m <= 0)
			m = 1;			/* IS THIS RIGHT FOR <0 ??? */
		m -= (sk->rtt >> 3);		/* m is now error in rtt est */
		sk->rtt += m;			/* rtt = 7/8 rtt + 1/8 new */
		if (m < 0)
			m = -m;			/* m is now abs(error) */
		m -= (sk->mdev >> 2);		/* similar update on mdev */
		sk->mdev += m;			/* mdev = 3/4 mdev + 1/4 new */
	} else {
		/* no previous measure. */
		sk->rtt = m << 3;		/* take the measured time to be rtt */
		sk->mdev = m << 2;		/* make sure rto = 3*rtt */
	}

	/*
	 *	Now update timeout.  Note that this removes any backoff.
	 */

	sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
	if (sk->rto > 120*HZ)
		sk->rto = 120*HZ;
	if (sk->rto < HZ/5)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
		sk->rto = HZ/5;
	sk->backoff = 0;
}
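
/*
 * Unscaled, the rto computed above works out as follows (an
 * illustration, not from the original source): sk->rtt holds 8*srtt
 * and sk->mdev holds 4*mdev, so
 *
 *	rto = ((8*srtt >> 2) + 4*mdev) >> 1 = srtt + 2*mdev
 *
 * e.g. srtt = 40 ticks and mdev = 10 ticks give sk->rtt == 320,
 * sk->mdev == 40 and rto == (80 + 40) >> 1 == 60 ticks.
 */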

/*
 *	Cached last hit socket
 */

static volatile unsigned long th_cache_saddr, th_cache_daddr;
static volatile unsigned short th_cache_dport, th_cache_sport;
static volatile struct sock *th_cache_sk;

void tcp_cache_zap(void)
{
	th_cache_sk = NULL;
}

/*
 *	Find the socket, using the last hit cache if applicable. The cache is not quite
 *	right...
 */

static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
{
	struct sock * sk;

	sk = (struct sock *) th_cache_sk;
	if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
	    sport != th_cache_sport || dport != th_cache_dport) {
		sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
		if (sk) {
			th_cache_saddr = saddr;
			th_cache_daddr = daddr;
			th_cache_dport = dport;
			th_cache_sport = sport;
			th_cache_sk = sk;
		}
	}
	return sk;
}
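
/*
 * Illustrative note (not from the original source): the cache above
 * holds a single entry, so two connections taking turns on the wire
 * (say A:1024->B:80 interleaved with C:2048->B:80) miss on every
 * lookup and fall back to get_sock() each time. That is part of what
 * the "not quite right" comment above is admitting to.
 */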

/*
 *	React to an out-of-window TCP sequence number in an incoming packet
 */

static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, u32 end_seq,
	      struct device *dev)
{
	if (th->rst)
		return;

	/*
	 *	Send a reset if we get something not ours and we are
	 *	unsynchronized. Note: We don't do anything to our end. We
	 *	are just killing the bogus remote connection then we will
	 *	connect again and it will work (with luck).
	 */

	if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
	{
		tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
		return;
	}

	/*
	 *	4.3reno machines look for these kinds of acks so they can do fast
	 *	recovery. Three identical 'old' acks let it know that one frame has
	 *	been lost and should be resent. Because this is before the whole window
	 *	of data has timed out it can take one lost frame per window without
	 *	stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
	 */
	tcp_send_ack(sk);
}

/*
 *	This function checks to see if the tcp header is actually acceptable.
 */

extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
{
	u32 end_window = sk->acked_seq + sk->window;
	return	/* if start is at end of window, end must be too (zero window) */
		(seq == end_window && seq == end_seq) ||
		/* if start is before end of window, check for interest */
		(before(seq, end_window) && !before(end_seq, sk->acked_seq));
}
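
/*
 * A concrete acceptance check for the test above (an illustration,
 * not from the original source): with acked_seq == 1000 and
 * window == 500, end_window is 1500. A segment covering 1200..1300
 * passes the second clause; one ending before 1000 or starting past
 * 1500 is rejected. With a zero window only the bare probe with
 * seq == end_seq == 1000 satisfies the first clause.
 */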

/*
 *	When we get a reset we do this. This probably is a tcp_output routine
 *	really.
 */

static int tcp_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;
	/*
	 *	We want the right error as BSD sees it (and indeed as we do).
	 */
	sk->err = ECONNRESET;
	if (sk->state == TCP_SYN_SENT)
		sk->err = ECONNREFUSED;
	if (sk->state == TCP_CLOSE_WAIT)
		sk->err = EPIPE;
#ifdef CONFIG_TCP_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 *
	 *	This is a good idea, but causes more sockets to take time to close.
	 *
	 *	Ian Heavens has since shown this is an inadequate fix for the protocol
	 *	bug in question.
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	return 0;
}

/*
 *	Look for tcp options. Parses everything but only knows about MSS.
 *	This routine is always called with the packet containing the SYN.
 *	However it may also be called with the ack to the SYN.  So you
 *	can't assume this is always the SYN.  It's always called after
 *	we have set up sk->mtu to our own MTU.
 *
 *	We need at minimum to add PAWS support here. Possibly large windows
 *	as Linux gets deployed on 100Mb/sec networks.
 */

static void tcp_options(struct sock *sk, struct tcphdr *th)
{
	unsigned char *ptr;
	int length = (th->doff*4) - sizeof(struct tcphdr);
	int mss_seen = 0;

	ptr = (unsigned char *)(th + 1);

	while (length > 0)
	{
		int opcode = *ptr++;
		int opsize = *ptr++;
		switch (opcode)
		{
			case TCPOPT_EOL:
				return;
			case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
				length--;
				ptr--;		/* the opsize=*ptr++ above was a mistake */
				continue;

			default:
				if (opsize <= 2)	/* Avoid silly options looping forever */
					return;
				switch (opcode)
				{
					case TCPOPT_MSS:
						if (opsize == 4 && th->syn)
						{
							sk->mtu = min(sk->mtu, ntohs(*(unsigned short *)ptr));
							mss_seen = 1;
						}
						break;
					/* Add other options here as people feel the urge to implement stuff like large windows */
				}
				ptr += opsize-2;
				length -= opsize;
		}
	}
	if (th->syn)
	{
		if (!mss_seen)
			sk->mtu = min(sk->mtu, 536);	/* default MSS if none sent */
	}
#ifdef CONFIG_INET_PCTCP
	sk->mss = min(sk->max_window >> 1, sk->mtu);
#else
	sk->mss = min(sk->max_window, sk->mtu);
	sk->max_unacked = 2 * sk->mss;
#endif
}
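
/*
 * On the wire, the one option this parser acts on looks like this
 * (an illustration, not from the original source): an MSS of 1460 in
 * a SYN is the four bytes 0x02 0x04 0x05 0xb4 - kind TCPOPT_MSS,
 * length 4, then the 16-bit value in network byte order, which the
 * code above reads with ntohs() and uses to clip sk->mtu.
 */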

/*
 *	This routine handles a connection request.
 *	It should make sure we haven't already responded.
 *	Because of the way BSD works, we have to send a syn/ack now.
 *	This also means it will be harder to close a socket which is
 *	listening.
 */

static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
{
	struct sock *newsk;
	struct tcphdr *th;
	struct rtable *rt;

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead)
	{
		sk->data_ready(sk,0);
	}
	else
	{
		if (sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	Make sure we can accept more.  This will prevent a
	 *	flurry of syns from eating up all our memory.
	 *
	 *	BSD does some funnies here and allows 3/2 times the
	 *	set backlog as a fudge factor. That's just too gross.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	We need to build a new sock struct.
	 *	It is sort of bad to have a socket without an inode attached
	 *	to it, but the wake_up's will just wake up the listening socket,
	 *	and if the listening socket is destroyed before this is taken
	 *	off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* just ignore the syn.  It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	memcpy(newsk, sk, sizeof(*newsk));
	newsk->opt = NULL;
	newsk->ip_route_cache = NULL;
	if (opt && opt->optlen)
	{
		sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
		if (!sk->opt)
		{
			kfree_s(newsk, sizeof(struct sock));
			tcp_statistics.TcpAttemptFails++;
			kfree_skb(skb, FREE_READ);
			return;
		}
		if (ip_options_echo(sk->opt, opt, daddr, saddr, skb))
		{
			kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
			kfree_s(newsk, sizeof(struct sock));
			tcp_statistics.TcpAttemptFails++;
			kfree_skb(skb, FREE_READ);
			return;
		}
	}
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = TCP_TIMEOUT_INIT<<1;
	newsk->max_window = 0;
	newsk->cong_window = 1;
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	newsk->acked_seq = skb->seq+1;
	newsk->lastwin_seq = skb->seq+1;
	newsk->delay_acks = 1;
	newsk->copied_seq = skb->seq+1;
	newsk->fin_seq = skb->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger = 0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->delack_timer);
	newsk->delack_timer.data = (unsigned long)newsk;
	newsk->delack_timer.function = tcp_delack_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function = tcp_retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/*
	 *	Swap these two, they are from our point of view.
	 */

	newsk->daddr = saddr;
	newsk->saddr = daddr;
	newsk->rcv_saddr = daddr;

	put_sock(newsk->num,newsk);
	newsk->acked_seq = skb->seq + 1;
	newsk->copied_seq = skb->seq + 1;
	newsk->socket = NULL;

	/*
	 *	Grab the ttl and tos values and use them
	 */

	newsk->ip_ttl = sk->ip_ttl;
	newsk->ip_tos = skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for
	 */

	/*
	 *	Note use of sk->user_mss, since user has no direct access to newsk
	 */

	rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
	newsk->ip_route_cache = rt;

	if (rt != NULL && (rt->rt_flags & RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if (rt)
		newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	else
		newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

	/*
	 *	But not bigger than device MTU
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

#ifdef CONFIG_SKIP

	/*
	 *	SKIP devices set their MTU to 65535. This is so they can take packets
	 *	unfragmented to security process then fragment. They could lie to the
	 *	TCP layer about a suitable MTU, but it's easier to let skip sort it out
	 *	simply because the final package we want unfragmented is going to be
	 *
	 *	[IPHDR][IPSP][Security data][Modified TCP data][Security data]
	 */

	if (skip_pick_mtu != NULL)	/* If SKIP is loaded.. */
		sk->mtu = skip_pick_mtu(sk->mtu, dev);
#endif
	/*
	 *	This will min with what arrived in the packet
	 */

	tcp_options(newsk,skb->h.th);

	tcp_cache_zap();
	tcp_send_synack(newsk, sk, skb);
}
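
/*
 * A sizing example for the code above (an illustration, not from the
 * original source): over plain Ethernet with dev->mtu == 1500 and no
 * route metric or user override, newsk->mtu starts from the 576
 * default minus headers (536) and stays there, since the device clamp
 * of 1500 - 20 - 20 == 1460 is larger; tcp_options() may then lower
 * it further if the peer advertised a smaller MSS.
 */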

/*
 *	Handle a TCP window that shrunk on us. It shouldn't happen,
 *	but..
 *
 *	We may need to move packets from the send queue
 *	to the write queue, if the window has been shrunk on us.
 *	The RFC says you are not allowed to shrink your window
 *	like this, but if the other end does, you must be able
 *	to deal with it.
 */
void tcp_window_shrunk(struct sock * sk, u32 window_seq)
{
	struct sk_buff *skb;
	struct sk_buff *skb2;
	struct sk_buff *wskb = NULL;

	skb2 = sk->send_head;
	sk->send_head = NULL;
	sk->send_tail = NULL;

	/*
	 *	This is an artifact of a flawed concept. We want one
	 *	queue and a smarter send routine when we send all.
	 */
	cli();
	while (skb2 != NULL)
	{
		skb = skb2;
		skb2 = skb->link3;
		skb->link3 = NULL;
		if (after(skb->end_seq, window_seq))
		{
			if (sk->packets_out > 0)
				sk->packets_out--;
			/* We may need to remove this from the dev send list. */
			if (skb->next != NULL)
			{
				skb_unlink(skb);
			}
			/* Now add it to the write_queue. */
			if (wskb == NULL)
				skb_queue_head(&sk->write_queue,skb);
			else
				skb_append(wskb,skb);
			wskb = skb;
		}
		else
		{
			if (sk->send_head == NULL)
			{
				sk->send_head = skb;
				sk->send_tail = skb;
			}
			else
			{
				sk->send_tail->link3 = skb;
				sk->send_tail = skb;
			}
			skb->link3 = NULL;
		}
	}
	sti();
}

/*
 *	This routine deals with incoming acks, but not outgoing ones.
 *
 *	This routine is totally _WRONG_. The list structuring is wrong,
 *	the algorithm is wrong, the code is wrong.
 */

static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
{
	int flag = 0;
	u32 window_seq;

	/*
	 * 1 - there was data in packet as well as ack or new data is sent or
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if (sk->zapped)
		return 1;	/* Dead, can't ack any more so why bother */

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */

	if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
		goto uninteresting_ack;

	/*
	 *	If there is data set flag 1
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 *	Have we discovered a larger window?
	 */
	window_seq = ntohs(th->window);
	if (window_seq > sk->max_window)
	{
		sk->max_window = window_seq;
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(window_seq>>1, sk->mtu);
#else
		sk->mss = min(window_seq, sk->mtu);
#endif
	}
	window_seq += ack;

	/*
	 *	See if our window has been shrunk.
	 */
	if (after(sk->window_seq, window_seq)) {
		flag |= 4;
		tcp_window_shrunk(sk, window_seq);
	}

	/*
	 *	Pipe has emptied
	 */
	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out = 0;
	}

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
	    sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{

		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a
		 * counter and increment it once every cwnd times.  It's possible
		 * that this should be done only if sk->retransmits == 0.  I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly.  In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}
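
	/*
	 * An illustrative walk-through of the block above (not from the
	 * original source): with ssthresh == 8, each new ACK grows
	 * cong_window by one mss - 1, 2, 3... - which roughly doubles it
	 * every round trip (slow start). Once cong_window reaches 8,
	 * cong_count must climb all the way to cong_window before the
	 * window ticks up by one, i.e. about one mss of growth per
	 * window's worth of ACKs (congestion avoidance).
	 */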

	/*
	 *	Remember the highest ack received and update the
	 *	right hand window edge of the host.
	 *	We do a bit of work here to track number of times we've
	 *	seen this ack without a change in the right edge of the
	 *	window and no data in the packet.
	 *	This will allow us to do fast retransmits.
	 */

	/* We are looking for duplicate ACKs here.
	 * An ACK is a duplicate if:
	 * (1) it has the same sequence number as the largest number we've seen,
	 * (2) it has the same window as the last ACK,
	 * (3) we have outstanding data that has not been ACKed,
	 * (4) the packet was not carrying any data.
	 * I've tried to order these in occurrence of most likely to fail
	 * to least likely to fail.
	 * [These are the rules BSD stacks use to determine if an ACK is a
	 * duplicate.]
	 */

	if (sk->rcv_ack_seq == ack
		&& sk->window_seq == window_seq
		&& !(flag&1)
		&& before(ack, sk->sent_seq))
	{
		/* See draft-stevens-tcpca-spec-01 for explanation
		 * of what we are doing here.
		 */
		sk->rcv_ack_cnt++;
		if (sk->rcv_ack_cnt == MAX_DUP_ACKS+1) {
			sk->ssthresh = max(sk->cong_window >> 1, 2);
			sk->cong_window = sk->ssthresh+MAX_DUP_ACKS+1;
			tcp_do_retransmit(sk,0);
			/* reduce the count. We don't want to be
			 * seen to be in "retransmit" mode if we
			 * are doing a fast retransmit.
			 */
			sk->retransmits--;
		} else if (sk->rcv_ack_cnt > MAX_DUP_ACKS+1) {
			sk->cong_window++;
			/*
			 * At this point we are supposed to transmit a NEW
			 * packet (not retransmit the missing packet,
			 * this would only get us into a retransmit war.)
			 * I think that having just adjusted cong_window
			 * we will transmit the new packet below.
			 */
		}
	}
	else
	{
		if (sk->rcv_ack_cnt > MAX_DUP_ACKS) {
			sk->cong_window = sk->ssthresh;
		}
		sk->window_seq = window_seq;
		sk->rcv_ack_seq = ack;
		sk->rcv_ack_cnt = 1;
	}
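
	/*
	 * A worked example of the fast-retransmit path above (an
	 * illustration, not from the original source), assuming
	 * MAX_DUP_ACKS is 3: the first ACK for a given sequence sets
	 * rcv_ack_cnt to 1; three duplicates push it to 4, at which point
	 * ssthresh drops to half the current cong_window, the window is
	 * reinflated by the duplicate count and the missing segment is
	 * retransmitted. Any further duplicates grow cong_window by one
	 * each, mirroring the segments that have left the network.
	 */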

	/*
	 *	We passed data and got it acked, remove any soft error
	 *	log. Something worked...
	 */

	sk->err_soft = 0;

	/*
	 *	If this ack opens up a zero window, clear backoff.  It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open?
		 */

		if (!skb_queue_empty(&sk->write_queue) &&   /* should always be true */
		    !before(sk->window_seq, sk->write_queue.next->end_seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt.  this eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < HZ/5)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = HZ/5;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	for (;;) {
		struct sk_buff * skb = sk->send_head;
		if (!skb)
			break;

		/* Check for a bug. */
		if (skb->link3 && after(skb->end_seq, skb->link3->end_seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived at the other end.
		 */

		if (after(skb->end_seq, ack))
			break;

		if (sk->retransmits)
		{
			/*
			 *	We were retransmitting.  don't count this in RTT est
			 */
			flag |= 2;
		}

		if ((sk->send_head = skb->link3) == NULL)
		{
			sk->send_tail = NULL;
			sk->retransmits = 0;
		}
		/*
		 * Note that we only reset backoff and rto in the
		 * rtt recomputation code.  And that doesn't happen
		 * if there were retransmissions in effect.  So the
		 * first new packet after the retransmissions is
		 * sent with the backoff still in effect.  Not until
		 * we get an ack from a non-retransmitted packet do
		 * we reset the backoff and rto.  This allows us to deal
		 * with a situation where the network delay has increased
		 * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
		 */

		/*
		 *	We have one less packet out there.
		 */

		if (sk->packets_out > 0)
			sk->packets_out--;

		if (!(flag&2))	/* Not retransmitting */
			tcp_rtt_estimator(sk,skb);
		flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
				   in this case' as we just set it up */
		IS_SKB(skb);

		/*
		 *	We may need to remove this from the dev send list.
		 */
		cli();
		if (skb->next)
			skb_unlink(skb);
		sti();
		kfree_skb(skb, FREE_WRITE); /* write. */
		if (!sk->dead)
			sk->write_space(sk);
	}

	/*
	 * XXX someone ought to look at this too.. at the moment, if skb_peek()
	 * returns non-NULL, we completely ignore the timer stuff in the else
	 * clause.  We ought to organize the code so that else clause can
	 * (should) be executed regardless, possibly moving the PROBE timer
	 * reset over.  The skb_peek() thing should only move stuff to the
	 * write queue, NOT also manage the timer functions.
	 */

	/*
	 *	Maybe we can take some stuff off of the write queue,
	 *	and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (!before(sk->window_seq, sk->write_queue.next->end_seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 !after(sk->write_queue.next->end_seq, sk->rcv_ack_seq))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room.
			 */
			tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 * from TCP_CLOSE we don't do anything
		 *
		 * from anything else, if there is write data (or fin) pending,
		 * we use a TIME_WRITE timeout, else if keepalive we reset to
		 * a KEEPALIVE timeout, else we delete the timer.
		 *
		 * We do not set flag for nominal write data, otherwise we may
		 * force a state where we start to write itsy bitsy tidbits
		 * of data.
		 */

		switch (sk->state) {
			case TCP_TIME_WAIT:
				/*
				 * keep us in TIME_WAIT until we stop getting packets,
				 * reset the timeout.
				 */
				tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
				break;
			case TCP_CLOSE:
				/*
				 * don't touch the timer.
				 */
				break;
			default:
				/*
				 *	Must check send_head and write_queue
				 *	to determine which timeout to use.
				 */
				if (sk->send_head || !skb_queue_empty(&sk->write_queue)) {
					tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
				} else if (sk->keepopen) {
					tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
				} else {
					del_timer(&sk->retransmit_timer);
					sk->ip_xmit_timeout = 0;
				}
				break;
		}
	}

	/*
	 *	We have nothing queued but space to send. Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0
	    && sk->partial != NULL
	    && skb_queue_empty(&sk->write_queue)
	    && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
	 * we are now waiting for an acknowledge to our FIN.  The other end is
	 * already in TIME_WAIT.
	 *
	 * Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->debug)
			printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			sk->shutdown = SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_CLOSE);
			return 1;
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 * Move to FIN_WAIT2 to await a FIN from the other end. Set
	 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 * Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake
	 */

	if (sk->state == TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest = th->source;
		sk->copied_seq = sk->acked_seq;
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->max_window == 0)
		{
			sk->max_window = 32;	/* Sanity check */
			sk->mss = min(sk->max_window,sk->mtu);
		}
	}

	/*
	 * I make no guarantees about the first clause in the following
	 * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
	 * what conditions "!flag" would be true.  However I think the rest
	 * of the conditions would prevent that from causing any
	 * unnecessary retransmission.
	 *   Clearly if the first packet has expired it should be
	 * retransmitted.  The other alternative, "flag&2 && retransmits", is
	 * harder to explain:  You have to look carefully at how and when the
	 * timer is set and with what timeout.  The most recent transmission always
	 * sets the timer.  So in general if the most recent thing has timed
	 * out, everything before it has as well.  So we want to go ahead and
	 * retransmit some more.  If we didn't explicitly test for this
	 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 * would not be true.  If you look at the pattern of timing, you can
	 * show that rto is increased fast enough that the next packet would
	 * almost never be retransmitted immediately.  Then you'd end up
	 * waiting for a timeout to send each packet on the retransmission
	 * queue.  With my implementation of the Karn sampling algorithm,
	 * the timeout would double each time.  The net result is that it would
	 * take a hideous amount of time to recover from a single dropped packet.
	 * It's possible that there should also be a test for TIME_WRITE, but
	 * I think as long as "send_head != NULL" and "retransmit" is on, we've
	 * got to be in real retransmission mode.
	 *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
	 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 * As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	    (((flag&2) && sk->retransmits) ||
	     (sk->send_head->when + sk->rto < jiffies)))
	{
		if (sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return 1;

uninteresting_ack:
	if (sk->debug)
		printk("Ack ignored %u %u\n",ack,sk->sent_seq);

	/*
	 *	Keepalive processing.
	 */

	if (after(ack, sk->sent_seq))
	{
		return 0;
	}

	/*
	 *	Restart the keepalive timer.
	 */

	if (sk->keepopen)
	{
		if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
			tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
	}
	return 1;
}

/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 */

static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	sk->fin_seq = skb->end_seq;

	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch (sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * move to CLOSE_WAIT, tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * received a retransmission of the FIN,
			 * restart the TIME_WAIT timer.
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return 0;
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens, we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */

			if (sk->ip_xmit_timeout != TIME_WRITE)
				tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * received a FIN -- send ACK and enter TIME_WAIT
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown |= SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * already in CLOSE
			 */
			break;
		default:
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return 0;
	}

	return 0;
}

/*
 * Add a sk_buff to the TCP receive queue, calculating
 * the ACK sequence as we go..
 */
static inline void tcp_insert_skb(struct sk_buff * skb, struct sk_buff_head * list)
{
	struct sk_buff * prev, * next;
	u32 seq;

	/*
	 * Find where the new skb goes.. (This goes backwards,
	 * on the assumption that we get the packets in order)
	 */
	seq = skb->seq;
	prev = list->prev;
	next = (struct sk_buff *) list;
	for (;;) {
		if (prev == (struct sk_buff *) list || !after(prev->seq, seq))
			break;
		next = prev;
		prev = prev->prev;
	}
	__skb_insert(skb, prev, next, list);
}
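
/*
 * An insertion example (not from the original source): if the queue
 * holds segments with seq 100, 200 and 400, a new skb with seq 300
 * walks backwards past 400, stops at 200 (since after(200, 300) is
 * false) and is linked between them. In the common in-order case the
 * new seq is not before the tail's, so the loop exits after a single
 * comparison.
 */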

/*
 * Called for each packet when we find a new ACK endpoint sequence in it
 */
static inline u32 tcp_queue_ack(struct sk_buff * skb, struct sock * sk)
{
	/*
	 * When we ack the fin, we do the FIN
	 * processing.
	 */
	skb->acked = 1;
	if (skb->h.th->fin)
		tcp_fin(skb,sk,skb->h.th);
	return skb->end_seq;
}

static void tcp_queue(struct sk_buff * skb, struct sock * sk, struct tcphdr *th)
{
	u32 ack_seq;

	tcp_insert_skb(skb, &sk->receive_queue);

	/*
	 * Did we get anything new to ack?
	 */
	ack_seq = sk->acked_seq;

	if (!after(skb->seq, ack_seq)) {
		if (after(skb->end_seq, ack_seq)) {
			/* the packet straddles our window end */
			struct sk_buff_head * list = &sk->receive_queue;
			struct sk_buff * next;
			ack_seq = tcp_queue_ack(skb, sk);

			/*
			 * Do we have any old packets to ack that the above
			 * made visible? (Go forward from skb)
			 */
			next = skb->next;
			while (next != (struct sk_buff *) list) {
				if (after(next->seq, ack_seq))
					break;
				if (after(next->end_seq, ack_seq))
					ack_seq = tcp_queue_ack(next, sk);
				next = next->next;
			}

			/*
			 * Ok, we found new data, update acked_seq as
			 * necessary (and possibly send the actual
			 * ACK packet).
			 */
			sk->acked_seq = ack_seq;

		} else {
			if (sk->debug)
				printk("Ack duplicate packet.\n");
			tcp_send_ack(sk);
			return;
		}

		/*
		 * Delay the ack if possible.  Send ack's to
		 * fin frames immediately as there shouldn't be
		 * anything more to come.
		 */
		if (!sk->delay_acks || th->fin) {
			tcp_send_ack(sk);
		} else {
			/*
			 * If psh is set we assume it's an
			 * interactive session that wants quick
			 * acks to avoid nagling too much.
			 */
			int delay = HZ/2;
			if (th->psh)
				delay = HZ/50;
			tcp_send_delayed_ack(sk, delay);
		}

		/*
		 * Tell the user we have some more data.
		 */

		if (!sk->dead)
			sk->data_ready(sk,0);

	}
	else
	{
		/*
		 * If we've missed a packet, send an ack.
		 * Also start a timer to send another.
		 *
		 * 4.3reno machines look for these kinds of acks so
		 * they can do fast recovery. Three identical 'old'
		 * acks let it know that one frame has been lost
		 * and should be resent. Because this is before the
		 * whole window of data has timed out it can take
		 * one lost frame per window without stalling.
		 * [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
		 *
		 * We also should be spotting triple bad sequences.
		 * [We now do this.]
		 */

		if (!skb->acked)
		{
			if (sk->debug)
				printk("Ack past end of seq packet.\n");
			tcp_send_ack(sk);
			tcp_send_delayed_ack(sk,HZ/2);
		}
	}
}
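
/*
 * A timing note for the delayed-ACK choice above (an illustration,
 * not from the original source): with HZ == 100 the normal delay of
 * HZ/2 is half a second, while a segment carrying PSH cuts that to
 * HZ/50, i.e. 20ms, on the theory that an interactive peer wants its
 * echo acknowledged quickly.
 */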

/*
 *	This routine handles the data.  If there is room in the buffer,
 *	it will have already been moved into it.  If there is no
 *	room, then we will just have to discard the packet.
 */

static int tcp_data(struct sk_buff *skb, struct sock *sk,
	 unsigned long saddr, unsigned int len)
{
	struct tcphdr *th;
	u32 new_seq, shut_seq;

	th = skb->h.th;
	skb_pull(skb,th->doff*4);
	skb_trim(skb,len-(th->doff*4));

	/*
	 *	The bytes in the receive read/assembly queue have increased. Needed for the
	 *	low memory discard algorithm
	 */

	sk->bytes_rcv += skb->len;

	if (skb->len == 0 && !th->fin)
	{
		/*
		 *	Don't want to keep passing ack's back and forth.
		 *	(someone sent us dataless, boring frame)
		 */
		if (!th->ack)
			tcp_send_ack(sk);
		kfree_skb(skb, FREE_READ);
		return 0;
	}

	/*
	 *	We no longer have anyone receiving data on this connection.
	 */

#ifndef TCP_DONT_RST_SHUTDOWN

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		/*
		 *	FIXME: BSD has some magic to avoid sending resets to
		 *	broken 4.2 BSD keepalives. Much to my surprise a few non
		 *	BSD stacks still have broken keepalives so we want to
		 *	cope with it.
		 */

		if (skb->len)	/* We don't care if it's just an ack or
				   a keepalive/window probe */
		{
			new_seq = skb->seq + skb->len + th->syn;	/* Right edge of _data_ part of frame */

			/* Do this the way 4.4BSD treats it. Not what I'd
			   regard as the meaning of the spec but it's what BSD
			   does and clearly they know everything 8) */

			/*
			 *	This is valid because of two things
			 *
			 *	a) The way tcp_data behaves at the bottom.
			 *	b) A fin takes effect when read not when received.
			 */

			shut_seq = sk->acked_seq+1;	/* Last byte */

			if (after(new_seq,shut_seq))
			{
				if (sk->debug)
					printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
						sk, new_seq, shut_seq, sk->blog);
				if (sk->dead)
				{
					sk->acked_seq = new_seq + th->fin;
					tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
						sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
					tcp_statistics.TcpEstabResets++;
					sk->err = EPIPE;
					sk->error_report(sk);
					sk->shutdown = SHUTDOWN_MASK;
					tcp_set_state(sk,TCP_CLOSE);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
			}
		}
	}

#endif

	tcp_queue(skb, sk, th);

	return 0;
}
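
/*
 * A worked example of the shutdown test above (not from the original
 * source): with acked_seq == 5000, shut_seq is 5001. A 100-byte
 * segment at seq 5001 gives new_seq == 5101, which is after shut_seq,
 * so on a dead socket the stray data is answered with a reset and the
 * socket moves to TCP_CLOSE.
 */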

/*
 *	This routine is only called when we have urgent data
 *	signalled. It's the 'slow' part of tcp_urg. It could be
 *	moved inline now as tcp_urg is only called from one
 *	place. We handle URGent data wrong. We have to - as
 *	BSD still doesn't use the correction from RFC961.
 */

static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
{
	u32 ptr = ntohs(th->urg_ptr);

	if (ptr)
		ptr--;
	ptr += ntohl(th->seq);

	/* ignore urgent data that we've already seen and read */
	if (after(sk->copied_seq, ptr))
		return;

	/* do we already have a newer (or duplicate) urgent pointer? */
	if (sk->urg_data && !after(ptr, sk->urg_seq))
		return;

	/* tell the world about our new urgent pointer */
	if (sk->proc != 0) {
		if (sk->proc > 0) {
			kill_proc(sk->proc, SIGURG, 1);
		} else {
			kill_pg(-sk->proc, SIGURG, 1);
		}
	}
	sk->urg_data = URG_NOTYET;
	sk->urg_seq = ptr;
}
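
/*
 * Pointer arithmetic example for the above (not from the original
 * source): a segment with seq 1000 and urg_ptr 5 yields
 * ptr = (5 - 1) + 1000 = 1004, i.e. the urgent byte itself under the
 * BSD interpretation; the RFC961-corrected semantics would place it
 * one byte later, which is exactly the incompatibility the comment
 * above is living with.
 */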

/*
 *	This is the 'fast' part of urgent handling.
 */

static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
{
	/*
	 *	Check if we get a new urgent pointer - normally not
	 */

	if (th->urg)
		tcp_check_urg(sk,th);

	/*
	 *	Do we wait for any urgent data? - normally not
	 */

	if (sk->urg_data == URG_NOTYET) {
		u32 ptr;

		/*
		 *	Is the urgent pointer pointing into this packet?
		 */
		ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
		if (ptr < len) {
			sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
			if (!sk->dead)
				sk->data_ready(sk,0);
		}
	}
}

/*
 * This should be a bit smarter and remove partially
 * overlapping stuff too, but this should be good
 * enough for any even remotely normal case (and the
 * worst that can happen is that we have a few
 * unnecessary packets in the receive queue).
 *
 * This function is never called with an empty list..
 */
static inline void tcp_remove_dups(struct sk_buff_head * list)
{
	struct sk_buff * next = list->next;

	for (;;) {
		struct sk_buff * skb = next;
		next = next->next;
		if (next == (struct sk_buff *) list)
			break;
		if (before(next->end_seq, skb->end_seq)) {
			__skb_unlink(next, list);
			kfree_skb(next, FREE_READ);
			next = skb;
			continue;
		}
		if (next->seq != skb->seq)
			continue;
		__skb_unlink(skb, list);
		kfree_skb(skb, FREE_READ);
	}
}

/*
 * Throw out all unnecessary packets: we've gone over the
 * receive queue limit. This shouldn't happen in a normal
 * TCP connection, but we might have gotten duplicates etc.
 */
static void prune_queue(struct sk_buff_head * list)
{
	for (;;) {
		struct sk_buff * skb = list->prev;

		/* gone through it all? */
		if (skb == (struct sk_buff *) list)
			break;
		if (!skb->acked) {
			__skb_unlink(skb, list);
			kfree_skb(skb, FREE_READ);
			continue;
		}
		tcp_remove_dups(list);
		break;
	}
}

/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok = 0;

	/*
	 * "redo" is 1 if we have already seen this skb but couldn't
	 * use it at that time (the socket was locked).  In that case
	 * we have already done a lot of the work (looked up the socket
	 * etc).
	 */
	th = skb->h.th;
	sk = skb->sk;
	if (!redo) {
		tcp_statistics.TcpInSegs++;
		if (skb->pkt_type != PACKET_HOST)
			goto discard_it;

		/*
		 *	Pull up the IP header.
		 */

		skb_pull(skb, skb->h.raw-skb->data);

		/*
		 *	Try to use the device checksum if provided.
		 */
		switch (skb->ip_summed)
		{
			case CHECKSUM_NONE:
				skb->csum = csum_partial((char *)th, len, 0);
				/* fall through to verify the sum we just computed */
			case CHECKSUM_HW:
				if (tcp_check(th, len, saddr, daddr, skb->csum))
					goto discard_it;
				/* fall through */
			default:
				/* CHECKSUM_UNNECESSARY */
				break;
		}
		sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
		if (!sk)
			goto no_tcp_socket;
		skb->sk = sk;
		skb->seq = ntohl(th->seq);
		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
		skb->ack_seq = ntohl(th->ack_seq);

		skb->acked = 0;
		skb->used = 0;
		skb->free = 1;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/*
		 *	We may need to add it to the backlog here.
		 */
		if (sk->users)
		{
			__skb_queue_tail(&sk->back_log, skb);
			return 0;
		}
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk->zapped || sk->state == TCP_CLOSE)
		goto no_tcp_socket;

	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return 0;
	}

	/*
	 *	Charge the memory to the socket.
	 */

	skb->sk = sk;
	atomic_add(skb->truesize, &sk->rmem_alloc);

	/*
	 *	We should now do header prediction.
	 */

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if (sk->state != TCP_ESTABLISHED)	/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if (sk->state == TCP_LISTEN)
		{
			if (th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non-SYN frames are absorbed (old segments).
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it
			 */

			if (th->rst || !th->syn || th->ack || ip_chk_addr(daddr) != IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 *
			 *	Now that TTCP is starting to be used we ought to queue this data.
			 */

			return 0;
		}

		/*
		 *	Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
		 *	then it's a new connection
		 */

		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected. The SYN_SENT case is unusual and should
		 *	not be in line code. [AC]
		 */

		if (sk->state == TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if (th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if (!tcp_ack(sk,th,skb->ack_seq,len))
				{
					/* Reset the ack - it's an ack from a
					   different connection  [ th->rst is checked in tcp_send_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
				if (th->rst)
					return tcp_reset(sk,skb);
				if (!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok = 1;	/* Don't reset this connection for the syn */
				sk->acked_seq = skb->seq+1;
				sk->lastwin_seq = skb->seq+1;
				sk->fin_seq = skb->seq;
				tcp_send_ack(sk);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest = th->source;
				sk->copied_seq = sk->acked_seq;
				if (!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if (sk->max_window == 0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if (th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if (sk->saddr == saddr && sk->daddr == daddr &&
						sk->dummy_th.source == th->source &&
						sk->dummy_th.dest == th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

		/*
		 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
		 *	a more complex suggestion for fixing these reuse issues in RFC1644
		 *	but not yet ready for general use. Also see RFC1379.
		 *
		 *	Note the funny way we go back to the top of this function for
		 *	this case ("goto try_next_socket"). That also takes care of
		 *	checking "sk->users" for the new socket as well as doing all
		 *	the normal tests on the packet.
		 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(skb->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if (sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			atomic_sub(skb->truesize, &sk->rmem_alloc);
			skb->sk = NULL;
			sk->err = ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			/* this is not really correct: we should check sk->users */
			if (sk && sk->state == TCP_LISTEN)
			{
				skb->sk = sk;
				atomic_add(skb->truesize, &sk->rmem_alloc);
				tcp_conn_request(sk, skb, daddr, saddr, opt, dev, seq+128000);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */

	if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
	{
		bad_tcp_sequence(sk, th, skb->end_seq-th->syn, dev);
		kfree_skb(skb, FREE_READ);
		return 0;
	}

	if (th->rst)
		return tcp_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if (th->syn && !syn_ok)
	{
		tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_reset(sk,skb);
	}

	tcp_delack_estimator(sk);

	/*
	 *	Process the ACK
	 */

	if (th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if (sk->state == TCP_SYN_RECV)
		{
			tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	If the accepted buffer put us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */

	/*
	 *	Process urgent data
	 */

	tcp_urg(sk, th, len);

	/*
	 *	Process the encapsulated data
	 */

	if (tcp_data(skb,sk, saddr, len))
		kfree_skb(skb, FREE_READ);

	/*
	 *	If our receive queue has grown past its limits,
	 *	try to prune away duplicates etc..
	 */
	if (sk->rmem_alloc > sk->rcvbuf)
		prune_queue(&sk->receive_queue);

	/*
	 *	And done
	 */

	return 0;

no_tcp_socket:
	/*
	 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
	 */
	tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);

discard_it:
	/*
	 *	Discard frame
	 */
	skb->sk = NULL;
	kfree_skb(skb, FREE_READ);
	return 0;
}