/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	@(#)tcp_input.c	1.0.16	05/25/93
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * FIXES
 *		Pedro Roque	:	Double ACK bug
 */
#include <linux/config.h>
#include <net/tcp.h>

/*
 *	Policy code extracted so it's now separate
 */

/*
 *	Called each time to estimate the delayed ack timeout. This is
 *	how it should be done so a fast link isn't impacted by ack delay.
 */
extern __inline__ void tcp_delack_estimator(struct sock *sk)
{
	/*
	 *	Delayed ACK time estimator.
	 */

	if (sk->lrcvtime == 0)
	{
		sk->lrcvtime = jiffies;
		sk->ato = HZ/3;
	}
	else
	{
		int m;

		m = jiffies - sk->lrcvtime;

		sk->lrcvtime = jiffies;

		if (m <= 0)
			m = 1;

		if (m > (sk->rtt >> 3))
		{
			sk->ato = sk->rtt >> 3;
			/*
			 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
			 */
		}
		else
		{
			sk->ato = (sk->ato >> 1) + m;
			/*
			 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
			 */
		}
	}
}
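
/*
 *	Note: sk->ato tracks a smoothed packet inter-arrival time;
 *	once arrivals are further apart than rtt/8 the estimator falls
 *	back to rtt/8, so a fast link keeps short ack delays.
 */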

/*
 *	Called on frames that were known _not_ to have been
 *	retransmitted [see Karn/Partridge Proceedings SIGCOMM 87].
 *	The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
 */
extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
{
	long m;
	/*
	 *	The following amusing code comes from Jacobson's
	 *	article in SIGCOMM '88.  Note that rtt and mdev
	 *	are scaled versions of rtt and mean deviation.
	 *	This is designed to be as fast as possible
	 *	m stands for "measurement".
	 */

	m = jiffies - oskb->when;	/* RTT */
	if (m <= 0)
		m = 1;			/* IS THIS RIGHT FOR <0 ??? */
	m -= (sk->rtt >> 3);		/* m is now error in rtt est */
	sk->rtt += m;			/* rtt = 7/8 rtt + 1/8 new */
	if (m < 0)
		m = -m;			/* m is now abs(error) */
	m -= (sk->mdev >> 2);		/* similar update on mdev */
	sk->mdev += m;			/* mdev = 3/4 mdev + 1/4 new */

	/*
	 *	Now update timeout.  Note that this removes any backoff.
	 */
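
	/*
	 *	With rtt kept scaled by 8 and mdev by 4, the line below
	 *	works out to rto = srtt + 2*mdev in clock ticks, which is
	 *	Jacobson's retransmit timeout formula.
	 */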
	sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
	if (sk->rto > 120*HZ)
		sk->rto = 120*HZ;
	if (sk->rto < HZ/5)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
		sk->rto = HZ/5;
	sk->backoff = 0;
}

/*
 *	Cached last hit socket
 */

static volatile unsigned long th_cache_saddr, th_cache_daddr;
static volatile unsigned short th_cache_dport, th_cache_sport;
static volatile struct sock *th_cache_sk;

void tcp_cache_zap(void)
{
	th_cache_sk = NULL;
}

/*
 *	Find the socket, using the last hit cache if applicable. The cache
 *	is not quite right...
 */

static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
{
	struct sock * sk;

	sk = (struct sock *) th_cache_sk;
	if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
	    sport != th_cache_sport || dport != th_cache_dport) {
		sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
		if (sk) {
			th_cache_saddr = saddr;
			th_cache_daddr = daddr;
			th_cache_dport = dport;
			th_cache_sport = sport;
			th_cache_sk = sk;
		}
	}
	return sk;
}
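
/*
 *	This is a single-entry cache: on a miss we fall back to the full
 *	get_sock() lookup and remember the result.  tcp_cache_zap() above
 *	is what invalidates the cached entry when the sockets change.
 */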

/*
 *	React to an out-of-window TCP sequence number in an incoming packet
 */

static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, u32 end_seq,
	      struct device *dev)
{
	if (th->rst)
		return;

	/*
	 *	Send a reset if we get something not ours and we are
	 *	unsynchronized. Note: We don't do anything to our end. We
	 *	are just killing the bogus remote connection then we will
	 *	connect again and it will work (with luck).
	 */

	if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
	{
		tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
		return;
	}

	/*
	 *	4.3reno machines look for these kind of acks so they can do fast
	 *	recovery. Three identical 'old' acks let it know that one frame has
	 *	been lost and should be resent. Because this is before the whole window
	 *	of data has timed out it can take one lost frame per window without
	 *	stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
	 */
	tcp_send_ack(sk);
}

/*
 *	This function checks to see if the tcp header is actually acceptable.
 */

extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
{
	u32 end_window = sk->acked_seq + sk->window;
	return	/* if start is at end of window, end must be too (zero window) */
		(seq == end_window && seq == end_seq) ||
		/* if start is before end of window, check for interest */
		(before(seq, end_window) && !before(end_seq, sk->acked_seq));
}
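
/*
 *	In other words, a segment [seq, end_seq) is acceptable if it
 *	overlaps the receive window [acked_seq, acked_seq+window), with
 *	the special case that a bare ack (seq == end_seq) sitting exactly
 *	at the right edge is still acceptable on a zero window.
 */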

/*
 *	When we get a reset we do this. This probably is a tcp_output routine
 *	really.
 */

static int tcp_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;
	/*
	 *	We want the right error as BSD sees it (and indeed as we do).
	 */
	sk->err = ECONNRESET;
	if (sk->state == TCP_SYN_SENT)
		sk->err = ECONNREFUSED;
	if (sk->state == TCP_CLOSE_WAIT)
		sk->err = EPIPE;
#ifdef CONFIG_TCP_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 *
	 *	This is a good idea, but causes more sockets to take time to close.
	 *
	 *	Ian Heavens has since shown this is an inadequate fix for the protocol
	 *	bug in question.
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	return(0);
}

/*
 *	Look for tcp options. Parses everything but only knows about MSS.
 *	This routine is always called with the packet containing the SYN.
 *	However it may also be called with the ack to the SYN. So you
 *	can't assume this is always the SYN. It's always called after
 *	we have set up sk->mtu to our own MTU.
 *
 *	We need at minimum to add PAWS support here. Possibly large windows
 *	as Linux gets deployed on 100Mb/sec networks.
 */

static void tcp_options(struct sock *sk, struct tcphdr *th)
{
	unsigned char *ptr;
	int length = (th->doff*4) - sizeof(struct tcphdr);
	int mss_seen = 0;

	ptr = (unsigned char *)(th + 1);

	while (length > 0)
	{
		int opcode = *ptr++;
		int opsize = *ptr++;
		switch (opcode)
		{
			case TCPOPT_EOL:
				return;
			case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
				length--;
				ptr--;		/* the opsize=*ptr++ above was a mistake */
				continue;

			default:
				if (opsize <= 2)	/* Avoid silly options looping forever */
					return;
				switch (opcode)
				{
					case TCPOPT_MSS:
						if (opsize == 4 && th->syn)
						{
							sk->mtu = min(sk->mtu, ntohs(*(unsigned short *)ptr));
							mss_seen = 1;
						}
						break;
					/* Add other options here as people feel the urge to implement stuff like large windows */
				}
				ptr += opsize - 2;
				length -= opsize;
		}
	}
	if (th->syn)
	{
		if (!mss_seen)
			sk->mtu = min(sk->mtu, 536);	/* default MSS if none sent */
	}
#ifdef CONFIG_INET_PCTCP
	sk->mss = min(sk->max_window >> 1, sk->mtu);
#else
	sk->mss = min(sk->max_window, sk->mtu);
	sk->max_unacked = 2 * sk->mss;
#endif
}
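
/*
 *	The 536 above is the RFC 1122 default effective MSS: 576 bytes
 *	of datagram minus 40 bytes of IP+TCP header, used when the peer
 *	sends no MSS option on its SYN.
 */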

/*
 *	This routine handles a connection request.
 *	It should make sure we haven't already responded.
 *	Because of the way BSD works, we have to send a syn/ack now.
 *	This also means it will be harder to close a socket which is
 *	listening.
 */

static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
{
	struct sock *newsk;
	struct tcphdr *th;
	struct rtable *rt;

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead)
	{
		sk->data_ready(sk,0);
	}
	else
	{
		if (sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	Make sure we can accept more. This will prevent a
	 *	flurry of syns from eating up all our memory.
	 *
	 *	BSD does some funnies here and allows 3/2 times the
	 *	set backlog as a fudge factor. That's just too gross.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	We need to build a new sock struct.
	 *	It is sort of bad to have a socket without an inode attached
	 *	to it, but the wake_up's will just wake up the listening socket,
	 *	and if the listening socket is destroyed before this is taken
	 *	off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* just ignore the syn.  It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	memcpy(newsk, sk, sizeof(*newsk));
	newsk->opt = NULL;
	newsk->ip_route_cache = NULL;
	if (opt && opt->optlen)
	{
		sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
		if (!sk->opt)
		{
			kfree_s(newsk, sizeof(struct sock));
			tcp_statistics.TcpAttemptFails++;
			kfree_skb(skb, FREE_READ);
			return;
		}
		if (ip_options_echo(sk->opt, opt, daddr, saddr, skb))
		{
			kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
			kfree_s(newsk, sizeof(struct sock));
			tcp_statistics.TcpAttemptFails++;
			kfree_skb(skb, FREE_READ);
			return;
		}
	}
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = TCP_TIMEOUT_INIT<<1;
	newsk->max_window = 0;
	newsk->cong_window = 1;
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	newsk->acked_seq = skb->seq+1;
	newsk->lastwin_seq = skb->seq+1;
	newsk->delay_acks = 1;
	newsk->copied_seq = skb->seq+1;
	newsk->fin_seq = skb->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger = 0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->delack_timer);
	newsk->delack_timer.data = (unsigned long)newsk;
	newsk->delack_timer.function = tcp_delack_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function = tcp_retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/*
	 *	Swap these two, they are from our point of view.
	 */

	newsk->daddr = saddr;
	newsk->saddr = daddr;
	newsk->rcv_saddr = daddr;

	put_sock(newsk->num,newsk);
	newsk->acked_seq = skb->seq + 1;
	newsk->copied_seq = skb->seq + 1;
	newsk->socket = NULL;

	/*
	 *	Grab the ttl and tos values and use them
	 */

	newsk->ip_ttl = sk->ip_ttl;
	newsk->ip_tos = skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for
	 */

	/*
	 *	Note use of sk->user_mss, since user has no direct access to newsk
	 */

	rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
	newsk->ip_route_cache = rt;

	if (rt != NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if (rt)
		newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	else
		newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

	/*
	 *	But not bigger than device MTU
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

#ifdef CONFIG_SKIP

	/*
	 *	SKIP devices set their MTU to 65535. This is so they can take packets
	 *	unfragmented to security process then fragment. They could lie to the
	 *	TCP layer about a suitable MTU, but it's easier to let skip sort it out
	 *	simply because the final package we want unfragmented is going to be
	 *
	 *	[IPHDR][IPSP][Security data][Modified TCP data][Security data]
	 */

	if (skip_pick_mtu != NULL)	/* If SKIP is loaded.. */
		sk->mtu = skip_pick_mtu(sk->mtu, dev);
#endif
	/*
	 *	This will min with what arrived in the packet
	 */

	tcp_options(newsk,skb->h.th);

	tcp_cache_zap();
	tcp_send_synack(newsk, sk, skb);
}
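
/*
 *	At this point the new socket sits in SYN_RECV with its SYN|ACK
 *	sent by tcp_send_synack(); the final ack of the three way
 *	handshake is picked up later in tcp_ack().
 */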

/*
 *	Handle a TCP window that shrunk on us. It shouldn't happen,
 *	but..
 *
 *	We may need to move packets from the send queue
 *	to the write queue, if the window has been shrunk on us.
 *	The RFC says you are not allowed to shrink your window
 *	like this, but if the other end does, you must be able
 *	to deal with it.
 */
void tcp_window_shrunk(struct sock * sk, u32 window_seq)
{
	struct sk_buff *skb;
	struct sk_buff *skb2;
	struct sk_buff *wskb = NULL;

	skb2 = sk->send_head;
	sk->send_head = NULL;
	sk->send_tail = NULL;

	/*
	 *	This is an artifact of a flawed concept. We want one
	 *	queue and a smarter send routine when we send all.
	 */
	cli();
	while (skb2 != NULL)
	{
		skb = skb2;
		skb2 = skb->link3;
		skb->link3 = NULL;
		if (after(skb->end_seq, window_seq))
		{
			if (sk->packets_out > 0)
				sk->packets_out--;
			/* We may need to remove this from the dev send list. */
			if (skb->next != NULL)
			{
				skb_unlink(skb);
			}
			/* Now add it to the write_queue. */
			if (wskb == NULL)
				skb_queue_head(&sk->write_queue,skb);
			else
				skb_append(wskb,skb);
			wskb = skb;
		}
		else
		{
			if (sk->send_head == NULL)
			{
				sk->send_head = skb;
				sk->send_tail = skb;
			}
			else
			{
				sk->send_tail->link3 = skb;
				sk->send_tail = skb;
			}
			skb->link3 = NULL;
		}
	}
	sti();
}
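
/*
 *	The cli()/sti() pair above is what protects the send list and
 *	device queue surgery from the network bottom half: packets past
 *	the new right edge go back on write_queue in order, the rest are
 *	relinked as the new retransmit queue.
 */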

/*
 *	This routine deals with incoming acks, but not outgoing ones.
 *
 *	This routine is totally _WRONG_. The list structuring is wrong,
 *	the algorithm is wrong, the code is wrong.
 */

static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
{
	int flag = 0;
	u32 window_seq;

	/*
	 * 1 - there was data in packet as well as ack or new data is sent or
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if (sk->zapped)
		return(1);	/* Dead, can't ack any more so why bother */

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */

	if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
		goto uninteresting_ack;

	/*
	 *	If there is data set flag 1
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 *	Have we discovered a larger window
	 */
	window_seq = ntohs(th->window);
	if (window_seq > sk->max_window)
	{
		sk->max_window = window_seq;
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(window_seq>>1, sk->mtu);
#else
		sk->mss = min(window_seq, sk->mtu);
#endif
	}
	window_seq += ack;

	/*
	 *	See if our window has been shrunk.
	 */
	if (after(sk->window_seq, window_seq)) {
		flag |= 4;
		tcp_window_shrunk(sk, window_seq);
	}

	/*
	 *	Pipe has emptied
	 */
	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out = 0;
	}

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{

		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a
		 * counter and increment it once every cwnd times.  It's possible
		 * that this should be done only if sk->retransmits == 0.  I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly.  In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}
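
	/*
	 *	Net effect of the above: below ssthresh the window grows by
	 *	one segment per new ack (slow start, exponential per RTT);
	 *	above it the counter grows it by roughly one segment per
	 *	window of acks (linear congestion avoidance).
	 */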

	/*
	 *	Remember the highest ack received and update the
	 *	right hand window edge of the host.
	 *	We do a bit of work here to track number of times we've
	 *	seen this ack without a change in the right edge of the
	 *	window and no data in the packet.
	 *	This will allow us to do fast retransmits.
	 */

	/* We are looking for duplicate ACKs here.
	 * An ACK is a duplicate if:
	 * (1) it has the same sequence number as the largest number we've seen,
	 * (2) it has the same window as the last ACK,
	 * (3) we have outstanding data that has not been ACKed
	 * (4) The packet was not carrying any data.
	 * I've tried to order these in occurrence of most likely to fail
	 * to least likely to fail.
	 * [These are the rules BSD stacks use to determine if an ACK is a
	 * duplicate.]
	 */

	if (sk->rcv_ack_seq == ack
		&& sk->window_seq == window_seq
		&& !(flag&1)
		&& before(ack, sk->sent_seq))
	{
		/* See draft-stevens-tcpca-spec-01 for explanation
		 * of what we are doing here.
		 */
		sk->rcv_ack_cnt++;
		if (sk->rcv_ack_cnt == MAX_DUP_ACKS+1) {
			sk->ssthresh = max(sk->cong_window >> 1, 2);
			sk->cong_window = sk->ssthresh+MAX_DUP_ACKS+1;
			tcp_do_retransmit(sk,0);
			/* reduce the count.  We don't want to be
			 * seen to be in "retransmit" mode if we
			 * are doing a fast retransmit.
			 */
			sk->retransmits--;
		} else if (sk->rcv_ack_cnt > MAX_DUP_ACKS+1) {
			sk->cong_window++;
			/*
			 * At this point we are supposed to transmit a NEW
			 * packet (not retransmit the missing packet,
			 * this would only get us into a retransmit war.)
			 * I think that having just adjusted cong_window
			 * we will transmit the new packet below.
			 */
		}
	}
	else
	{
		if (sk->rcv_ack_cnt > MAX_DUP_ACKS) {
			sk->cong_window = sk->ssthresh;
		}
		sk->window_seq = window_seq;
		sk->rcv_ack_seq = ack;
		sk->rcv_ack_cnt = 1;
	}
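
	/*
	 *	The above is the Reno fast retransmit / fast recovery shape:
	 *	on the (MAX_DUP_ACKS+1)th duplicate ack we halve ssthresh,
	 *	inflate cong_window and resend the missing segment; each
	 *	further duplicate inflates the window by one more segment,
	 *	and a fresh ack deflates the window back down to ssthresh.
	 */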

	/*
	 *	We passed data and got it acked, remove any soft error
	 *	log. Something worked...
	 */

	sk->err_soft = 0;

	/*
	 *	If this ack opens up a zero window, clear backoff.  It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open ?
		 */

		if (!skb_queue_empty(&sk->write_queue) &&	/* should always be true */
		    !before(sk->window_seq, sk->write_queue.next->end_seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt.  this eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < HZ/5)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = HZ/5;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	for (;;) {
		struct sk_buff * skb = sk->send_head;
		if (!skb)
			break;

		/* Check for a bug. */
		if (skb->link3 && after(skb->end_seq, skb->link3->end_seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived at the other end.
		 */

		if (after(skb->end_seq, ack))
			break;

		if (sk->retransmits)
		{
			/*
			 *	We were retransmitting.  don't count this in RTT est
			 */
			flag |= 2;
		}

		if ((sk->send_head = skb->link3) == NULL)
		{
			sk->send_tail = NULL;
			sk->retransmits = 0;
		}
		/*
		 * Note that we only reset backoff and rto in the
		 * rtt recomputation code.  And that doesn't happen
		 * if there were retransmissions in effect.  So the
		 * first new packet after the retransmissions is
		 * sent with the backoff still in effect.  Not until
		 * we get an ack from a non-retransmitted packet do
		 * we reset the backoff and rto.  This allows us to deal
		 * with a situation where the network delay has increased
		 * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
		 */

		/*
		 *	We have one less packet out there.
		 */

		if (sk->packets_out > 0)
			sk->packets_out--;

		if (!(flag&2))	/* Not retransmitting */
			tcp_rtt_estimator(sk,skb);
		flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
				   In this case as we just set it up */
		IS_SKB(skb);

		/*
		 *	We may need to remove this from the dev send list.
		 */
		cli();
		if (skb->next)
			skb_unlink(skb);
		sti();
		kfree_skb(skb, FREE_WRITE);	/* write. */
		if (!sk->dead)
			sk->write_space(sk);
	}

	/*
	 * XXX someone ought to look at this too.. at the moment, if skb_peek()
	 * returns non-NULL, we completely ignore the timer stuff in the else
	 * clause.  We ought to organize the code so that else clause can
	 * (should) be executed regardless, possibly moving the PROBE timer
	 * reset over.  The skb_peek() thing should only move stuff to the
	 * write queue, NOT also manage the timer functions.
	 */

	/*
	 *	Maybe we can take some stuff off of the write queue,
	 *	and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (!before(sk->window_seq, sk->write_queue.next->end_seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 !after(sk->write_queue.next->end_seq, sk->rcv_ack_seq))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room.
			 */
			tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 * from TCP_CLOSE we don't do anything
		 *
		 * from anything else, if there is write data (or fin) pending,
		 * we use a TIME_WRITE timeout, else if keepalive we reset to
		 * a KEEPALIVE timeout, else we delete the timer.
		 *
		 * We do not set flag for nominal write data, otherwise we may
		 * force a state where we start to write itsy bitsy tidbits
		 * of data.
		 */

		switch(sk->state) {
		case TCP_TIME_WAIT:
			/*
			 * keep us in TIME_WAIT until we stop getting packets,
			 * reset the timeout.
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 * don't touch the timer.
			 */
			break;
		default:
			/*
			 *	Must check send_head and write_queue
			 *	to determine which timeout to use.
			 */
			if (sk->send_head || !skb_queue_empty(&sk->write_queue)) {
				tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 *	We have nothing queued but space to send. Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0
	    && sk->partial != NULL
	    && skb_queue_empty(&sk->write_queue)
	    && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
	 * we are now waiting for an acknowledge to our FIN.  The other end is
	 * already in TIME_WAIT.
	 *
	 * Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->debug)
			printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			sk->shutdown = SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_CLOSE);
			return 1;
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 *	Move to FIN_WAIT2 to await a FIN from the other end. Set
	 *	SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 *	Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake
	 */

	if (sk->state == TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest = th->source;
		sk->copied_seq = sk->acked_seq;
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->max_window == 0)
		{
			sk->max_window = 32;	/* Sanity check */
			sk->mss = min(sk->max_window, sk->mtu);
		}
	}

	/*
	 * I make no guarantees about the first clause in the following
	 * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
	 * what conditions "!flag" would be true.  However I think the rest
	 * of the conditions would prevent that from causing any
	 * unnecessary retransmission.
	 *   Clearly if the first packet has expired it should be
	 * retransmitted.  The other alternative, "flag&2 && retransmits", is
	 * harder to explain:  You have to look carefully at how and when the
	 * timer is set and with what timeout.  The most recent transmission always
	 * sets the timer.  So in general if the most recent thing has timed
	 * out, everything before it has as well.  So we want to go ahead and
	 * retransmit some more.  If we didn't explicitly test for this
	 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 * would not be true.  If you look at the pattern of timing, you can
	 * show that rto is increased fast enough that the next packet would
	 * almost never be retransmitted immediately.  Then you'd end up
	 * waiting for a timeout to send each packet on the retransmission
	 * queue.  With my implementation of the Karn sampling algorithm,
	 * the timeout would double each time.  The net result is that it would
	 * take a hideous amount of time to recover from a single dropped packet.
	 * It's possible that there should also be a test for TIME_WRITE, but
	 * I think as long as "send_head != NULL" and "retransmit" is on, we've
	 * got to be in real retransmission mode.
	 *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
	 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 * As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	    (((flag&2) && sk->retransmits) ||
	     (sk->send_head->when + sk->rto < jiffies)))
	{
		if (sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return 1;

uninteresting_ack:
	if (sk->debug)
		printk("Ack ignored %u %u\n",ack,sk->sent_seq);

	/*
	 *	Keepalive processing.
	 */

	if (after(ack, sk->sent_seq))
	{
		return 0;
	}

	/*
	 *	Restart the keepalive timer.
	 */

	if (sk->keepopen)
	{
		if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
			tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
	}
	return 1;
}

/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 *
 */

static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	sk->fin_seq = skb->end_seq;

	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 *	move to CLOSE_WAIT, tcp_data() already handled
			 *	sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 *	received a retransmission of the FIN, do
			 *	nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 *	received a retransmission of the FIN,
			 *	restart the TIME_WAIT timer.
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 *	This case occurs when a simultaneous close
			 *	happens, we must ack the received FIN and
			 *	enter the CLOSING state.
			 *
			 *	This causes a WRITE timeout, which will either
			 *	move on to TIME_WAIT when we timeout, or resend
			 *	the FIN properly (maybe we get rid of that annoying
			 *	FIN lost hang). The TIME_WRITE code is already correct
			 *	for handling this timeout.
			 */

			if (sk->ip_xmit_timeout != TIME_WRITE)
				tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 *	received a FIN -- send ACK and enter TIME_WAIT
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown |= SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 *	already in CLOSE
			 */
			break;
		default:
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}

/*
 *	Add a sk_buff to the TCP receive queue, calculating
 *	the ACK sequence as we go..
 */
static inline void tcp_insert_skb(struct sk_buff * skb, struct sk_buff_head * list)
{
	struct sk_buff * prev, * next;
	u32 seq;

	/*
	 *	Find where the new skb goes.. (This goes backwards,
	 *	on the assumption that we get the packets in order)
	 */
	seq = skb->seq;
	prev = list->prev;
	next = (struct sk_buff *) list;
	for (;;) {
		if (prev == (struct sk_buff *) list || !after(prev->seq, seq))
			break;
		next = prev;
		prev = prev->prev;
	}
	__skb_insert(skb, prev, next, list);
}

/*
 *	Called for each packet when we find a new ACK endpoint sequence in it
 */
static inline u32 tcp_queue_ack(struct sk_buff * skb, struct sock * sk)
{
	/*
	 *	When we ack the fin, we do the FIN
	 *	processing.
	 */
	skb->acked = 1;
	if (skb->h.th->fin)
		tcp_fin(skb,sk,skb->h.th);
	return skb->end_seq;
}

static void tcp_queue(struct sk_buff * skb, struct sock * sk, struct tcphdr *th)
{
	u32 ack_seq;

	tcp_insert_skb(skb, &sk->receive_queue);

	/*
	 *	Did we get anything new to ack?
	 */
	ack_seq = sk->acked_seq;

	if (!after(skb->seq, ack_seq)) {
		if (after(skb->end_seq, ack_seq)) {
			/* the packet straddles our window end */
			struct sk_buff_head * list = &sk->receive_queue;
			struct sk_buff * next;
			ack_seq = tcp_queue_ack(skb, sk);

			/*
			 *	Do we have any old packets to ack that the above
			 *	made visible? (Go forward from skb)
			 */
			next = skb->next;
			while (next != (struct sk_buff *) list) {
				if (after(next->seq, ack_seq))
					break;
				if (after(next->end_seq, ack_seq))
					ack_seq = tcp_queue_ack(next, sk);
				next = next->next;
			}

			/*
			 *	Ok, we found new data, update acked_seq as
			 *	necessary (and possibly send the actual
			 *	ACK packet).
			 */
			sk->acked_seq = ack_seq;

		} else {
			if (sk->debug)
				printk("Ack duplicate packet.\n");
			tcp_send_ack(sk);
			return;
		}

		/*
		 *	Delay the ack if possible.  Send ack's to
		 *	fin frames immediately as there shouldn't be
		 *	anything more to come.
		 */
		if (!sk->delay_acks || th->fin) {
			tcp_send_ack(sk);
		} else {
			/*
			 *	If psh is set we assume it's an
			 *	interactive session that wants quick
			 *	acks to avoid nagling too much.
			 */
			int delay = HZ/2;
			if (th->psh)
				delay = HZ/50;
			tcp_send_delayed_ack(sk, delay);
		}

		/*
		 *	Tell the user we have some more data.
		 */

		if (!sk->dead)
			sk->data_ready(sk,0);

	}
	else
	{
		/*
		 *	If we've missed a packet, send an ack.
		 *	Also start a timer to send another.
		 *
		 *	4.3reno machines look for these kind of acks so
		 *	they can do fast recovery. Three identical 'old'
		 *	acks let it know that one frame has been lost
		 *	and should be resent. Because this is before the
		 *	whole window of data has timed out it can take
		 *	one lost frame per window without stalling.
		 *	[See Jacobson RFC1323, Stevens TCP/IP illus vol2]
		 *
		 *	We also should be spotting triple bad sequences.
		 *	[We now do this.]
		 *
		 */

		if (!skb->acked)
		{
			if (sk->debug)
				printk("Ack past end of seq packet.\n");
			tcp_send_ack(sk);
			tcp_send_delayed_ack(sk,HZ/2);
		}
	}
}
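
/*
 *	Ack policy in tcp_queue() above, in short: in-sequence data may
 *	be acked lazily (HZ/2, or HZ/50 when PSH hints at an interactive
 *	session), while duplicates and out-of-order segments are acked
 *	immediately so a Reno sender can spot the loss from dup acks.
 */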

/*
 *	This routine handles the data.  If there is room in the buffer,
 *	it will have already been moved into it.  If there is no
 *	room, then we will just have to discard the packet.
 */

static int tcp_data(struct sk_buff *skb, struct sock *sk,
	 unsigned long saddr, unsigned int len)
{
	struct tcphdr *th;
	u32 new_seq, shut_seq;

	th = skb->h.th;
	skb_pull(skb,th->doff*4);
	skb_trim(skb,len-(th->doff*4));

	/*
	 *	The bytes in the receive read/assembly queue have increased.
	 *	Needed for the low memory discard algorithm.
	 */

	sk->bytes_rcv += skb->len;

	if (skb->len == 0 && !th->fin)
	{
		/*
		 *	Don't want to keep passing ack's back and forth.
		 *	(someone sent us a dataless, boring frame)
		 */
		if (!th->ack)
			tcp_send_ack(sk);
		kfree_skb(skb, FREE_READ);
		return(0);
	}

	/*
	 *	We no longer have anyone receiving data on this connection.
	 */

#ifndef TCP_DONT_RST_SHUTDOWN

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		/*
		 *	FIXME: BSD has some magic to avoid sending resets to
		 *	broken 4.2 BSD keepalives. Much to my surprise a few non
		 *	BSD stacks still have broken keepalives so we want to
		 *	cope with it.
		 */

		if (skb->len)	/* We don't care if it's just an ack or
				   a keepalive/window probe */
		{
			new_seq = skb->seq + skb->len + th->syn;	/* Right edge of _data_ part of frame */

			/* Do this the way 4.4BSD treats it. Not what I'd
			   regard as the meaning of the spec but it's what BSD
			   does and clearly they know everything 8) */

			/*
			 *	This is valid because of two things
			 *
			 *	a) The way tcp_data behaves at the bottom.
			 *	b) A fin takes effect when read not when received.
			 */

			shut_seq = sk->acked_seq+1;	/* Last byte */

			if (after(new_seq,shut_seq))
			{
				if (sk->debug)
					printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
						sk, new_seq, shut_seq, sk->blog);
				if (sk->dead)
				{
					sk->acked_seq = new_seq + th->fin;
					tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
						sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
					tcp_statistics.TcpEstabResets++;
					sk->err = EPIPE;
					sk->error_report(sk);
					sk->shutdown = SHUTDOWN_MASK;
					tcp_set_state(sk,TCP_CLOSE);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
			}
		}
	}

#endif

	tcp_queue(skb, sk, th);

	return(0);
}

/*
 *	This routine is only called when we have urgent data
 *	signalled. It's the 'slow' part of tcp_urg. It could be
 *	moved inline now as tcp_urg is only called from one
 *	place. We handle URGent data wrong. We have to - as
 *	BSD still doesn't use the correction from RFC961.
 */

static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
{
	u32 ptr = ntohs(th->urg_ptr);

	if (ptr)
		ptr--;
	ptr += ntohl(th->seq);

	/* ignore urgent data that we've already seen and read */
	if (after(sk->copied_seq, ptr))
		return;

	/* do we already have a newer (or duplicate) urgent pointer? */
	if (sk->urg_data && !after(ptr, sk->urg_seq))
		return;

	/* tell the world about our new urgent pointer */
	if (sk->proc != 0) {
		if (sk->proc > 0) {
			kill_proc(sk->proc, SIGURG, 1);
		} else {
			kill_pg(-sk->proc, SIGURG, 1);
		}
	}
	sk->urg_data = URG_NOTYET;
	sk->urg_seq = ptr;
}
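
/*
 *	The ptr-- above reflects the BSD interpretation: the urgent
 *	pointer nominally points one past the urgent byte, so backing
 *	up one gives the sequence number of the urgent byte itself.
 */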

/*
 *	This is the 'fast' part of urgent handling.
 */

static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
{
	/*
	 *	Check if we get a new urgent pointer - normally not
	 */

	if (th->urg)
		tcp_check_urg(sk,th);

	/*
	 *	Do we wait for any urgent data? - normally not
	 */

	if (sk->urg_data == URG_NOTYET) {
		u32 ptr;

		/*
		 *	Is the urgent pointer pointing into this packet?
		 */
		ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
		if (ptr < len) {
			sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
			if (!sk->dead)
				sk->data_ready(sk,0);
		}
	}
}

/*
 *	This should be a bit smarter and remove partially
 *	overlapping stuff too, but this should be good
 *	enough for any even remotely normal case (and the
 *	worst that can happen is that we have a few
 *	unnecessary packets in the receive queue).
 *
 *	This function is never called with an empty list..
 */
static inline void tcp_remove_dups(struct sk_buff_head * list)
{
	struct sk_buff * next = list->next;

	for (;;) {
		struct sk_buff * skb = next;
		next = next->next;
		if (next == (struct sk_buff *) list)
			break;
		if (before(next->end_seq, skb->end_seq)) {
			__skb_unlink(next, list);
			kfree_skb(next, FREE_READ);
			next = skb;
			continue;
		}
		if (next->seq != skb->seq)
			continue;
		__skb_unlink(skb, list);
		kfree_skb(skb, FREE_READ);
	}
}

/*
 *	Throw out all unnecessary packets: we've gone over the
 *	receive queue limit. This shouldn't happen in a normal
 *	TCP connection, but we might have gotten duplicates etc.
 */
static void prune_queue(struct sk_buff_head * list)
{
	for (;;) {
		struct sk_buff * skb = list->prev;

		/* gone through it all? */
		if (skb == (struct sk_buff *) list)
			break;
		if (!skb->acked) {
			__skb_unlink(skb, list);
			kfree_skb(skb, FREE_READ);
			continue;
		}
		tcp_remove_dups(list);
		break;
	}
}
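
/*
 *	Pruning order above: unacked segments are stripped from the
 *	tail first (the sender will retransmit those anyway), and only
 *	then does tcp_remove_dups() sweep the remaining acked data for
 *	exact or contained duplicates.
 */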

/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok = 0;

	/*
	 *	"redo" is 1 if we have already seen this skb but couldn't
	 *	use it at that time (the socket was locked).  In that case
	 *	we have already done a lot of the work (looked up the socket
	 *	etc).
	 */
	th = skb->h.th;
	sk = skb->sk;
	if (!redo) {
		tcp_statistics.TcpInSegs++;
		if (skb->pkt_type != PACKET_HOST)
			goto discard_it;

		/*
		 *	Pull up the IP header.
		 */

		skb_pull(skb, skb->h.raw-skb->data);
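
		/*
		 *	The switch below relies on deliberate fall-through:
		 *	CHECKSUM_NONE computes skb->csum in software and then
		 *	falls into the CHECKSUM_HW verification, while the
		 *	default (CHECKSUM_UNNECESSARY) case skips both.
		 */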

		/*
		 *	Try to use the device checksum if provided.
		 */
		switch (skb->ip_summed)
		{
			case CHECKSUM_NONE:
				skb->csum = csum_partial((char *)th, len, 0);
			case CHECKSUM_HW:
				if (tcp_check(th, len, saddr, daddr, skb->csum))
					goto discard_it;
			default:
				/* CHECKSUM_UNNECESSARY */
				break;
		}
		sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
		if (!sk)
			goto no_tcp_socket;
		skb->sk = sk;
		skb->seq = ntohl(th->seq);
		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
		skb->ack_seq = ntohl(th->ack_seq);

		skb->acked = 0;
		skb->used = 0;
		skb->free = 1;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/*
		 *	We may need to add it to the backlog here.
		 */
		if (sk->users)
		{
			__skb_queue_tail(&sk->back_log, skb);
			return(0);
		}
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk->zapped || sk->state==TCP_CLOSE)
		goto no_tcp_socket;

	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}

	/*
	 *	Charge the memory to the socket.
	 */

	skb->sk = sk;
	atomic_add(skb->truesize, &sk->rmem_alloc);

	/*
	 *	We should now do header prediction.
	 */

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if (sk->state != TCP_ESTABLISHED)	/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if (sk->state == TCP_LISTEN)
		{
			if (th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it
			 */

			if (th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 *
			 *	Now that TTCP is starting to be used we ought to queue this data.
			 */

			return 0;
		}

		/*
		 *	Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
		 *	then it's a new connection
		 */

		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected. The SYN_SENT case is unusual and should
		 *	not be in line code. [AC]
		 */

		if (sk->state == TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if (th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if (!tcp_ack(sk,th,skb->ack_seq,len))
				{
					/* Reset the ack - it's an ack from a
					   different connection  [ th->rst is checked in tcp_send_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return(0);
				}
				if (th->rst)
					return tcp_reset(sk,skb);
				if (!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok = 1;	/* Don't reset this connection for the syn */
				sk->acked_seq = skb->seq+1;
				sk->lastwin_seq = skb->seq+1;
				sk->fin_seq = skb->seq;
				tcp_send_ack(sk);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest = th->source;
				sk->copied_seq = sk->acked_seq;
				if (!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if (sk->max_window == 0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if (th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if (sk->saddr==saddr && sk->daddr==daddr &&
					    sk->dummy_th.source==th->source &&
					    sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

		/*
		 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
		 *	a more complex suggestion for fixing these reuse issues in RFC1644
		 *	but not yet ready for general use. Also see RFC1379.
		 *
		 *	Note the funny way we go back to the top of this function for
		 *	this case ("goto try_next_socket"). That also takes care of
		 *	checking "sk->users" for the new socket as well as doing all
		 *	the normal tests on the packet.
		 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(skb->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if (sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			atomic_sub(skb->truesize, &sk->rmem_alloc);
			skb->sk = NULL;
			sk->err = ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			/* this is not really correct: we should check sk->users */
			if (sk && sk->state==TCP_LISTEN)
			{
				skb->sk = sk;
				atomic_add(skb->truesize, &sk->rmem_alloc);
				tcp_conn_request(sk, skb, daddr, saddr, opt, dev, seq+128000);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */

	if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
	{
		bad_tcp_sequence(sk, th, skb->end_seq-th->syn, dev);
		kfree_skb(skb, FREE_READ);
		return 0;
	}

	if (th->rst)
		return tcp_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if (th->syn && !syn_ok)
	{
		tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_reset(sk,skb);
	}

	tcp_delack_estimator(sk);

	/*
	 *	Process the ACK
	 */

	if (th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if (sk->state == TCP_SYN_RECV)
		{
			tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	If the accepted buffer put us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */

	/*
	 *	Process urgent data
	 */

	tcp_urg(sk, th, len);

	/*
	 *	Process the encapsulated data
	 */

	if (tcp_data(skb,sk, saddr, len))
		kfree_skb(skb, FREE_READ);

	/*
	 *	If our receive queue has grown past its limits,
	 *	try to prune away duplicates etc..
	 */
	if (sk->rmem_alloc > sk->rcvbuf)
		prune_queue(&sk->receive_queue);

	/*
	 *	And done
	 */

	return 0;

no_tcp_socket:
	/*
	 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
	 */
	tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);

discard_it:
	/*
	 *	Discard frame
	 */
	skb->sk = NULL;
	kfree_skb(skb, FREE_READ);
	return 0;
}