/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	@(#)tcp_input.c	1.0.16	05/25/93
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * FIXES
 *		Pedro Roque	:	Double ACK bug
 */

#include <linux/config.h>
#include <net/tcp.h>

/*
 *	Policy code extracted so it's now separate
 */

/*
 *	Called each time to estimate the delayed ack timeout. This is
 *	how it should be done so a fast link isn't impacted by ack delay.
 */

extern __inline__ void tcp_delack_estimator(struct sock *sk)
{
	/*
	 *	Delayed ACK time estimator.
	 */

	if (sk->lrcvtime == 0)
	{
		sk->lrcvtime = jiffies;
		sk->ato = HZ/3;
	}
	else
	{
		int m;

		m = jiffies - sk->lrcvtime;

		sk->lrcvtime = jiffies;

		if (m <= 0)
			m = 1;

		if (m > (sk->rtt >> 3))
		{
			sk->ato = sk->rtt >> 3;
			/*
			 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
			 */
		}
		else
		{
			sk->ato = (sk->ato >> 1) + m;
			/*
			 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
			 */
		}
	}
}

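/*
 *	Worked example (illustrative only, assuming HZ = 100): with
 *	sk->rtt>>3 around 30 jiffies, a run of segments arriving 10 jiffies
 *	apart gives m = 10, so ato = (ato>>1) + m decays towards about 2*m.
 *	After an idle gap with m = 50 (> rtt>>3) the estimator snaps ato back
 *	to rtt>>3, so the delayed ack never waits longer than that fraction
 *	of the smoothed round trip time.
 */
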
/*
 *	Called on frames that were known _not_ to have been
 *	retransmitted [see Karn/Partridge Proceedings SIGCOMM 87].
 *	The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
 */

extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
{
	long m;
	/*
	 *	The following amusing code comes from Jacobson's
	 *	article in SIGCOMM '88.  Note that rtt and mdev
	 *	are scaled versions of rtt and mean deviation.
	 *	This is designed to be as fast as possible
	 *	m stands for "measurement".
	 */

	m = jiffies - oskb->when;	/* RTT */
	if(m<=0)
		m=1;			/* IS THIS RIGHT FOR <0 ??? */
	m -= (sk->rtt >> 3);		/* m is now error in rtt est */
	sk->rtt += m;			/* rtt = 7/8 rtt + 1/8 new */
	if (m < 0)
		m = -m;			/* m is now abs(error) */
	m -= (sk->mdev >> 2);		/* similar update on mdev */
	sk->mdev += m;			/* mdev = 3/4 mdev + 1/4 new */

	/*
	 *	Now update timeout.  Note that this removes any backoff.
	 */

	sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
	if (sk->rto > 120*HZ)
		sk->rto = 120*HZ;
	if (sk->rto < HZ/5)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
		sk->rto = HZ/5;
	sk->backoff = 0;
}

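/*
 *	Worked example (illustrative only): sk->rtt is held scaled by 8 and
 *	sk->mdev scaled by 4, so for a true smoothed RTT R and mean deviation
 *	D (both in jiffies) the timeout computed above is approximately
 *	((8R >> 2) + 4D) >> 1 = R + 2D, clamped to [HZ/5, 120*HZ]. With
 *	HZ = 100, R = 30 and D = 5 that is about 40 jiffies (0.4 seconds).
 */
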
/*
 *	Cached last hit socket
 */

static volatile unsigned long th_cache_saddr, th_cache_daddr;
static volatile unsigned short th_cache_dport, th_cache_sport;
static volatile struct sock *th_cache_sk;

void tcp_cache_zap(void)
{
	th_cache_sk=NULL;
}

/*
 *	Find the socket, using the last hit cache if applicable. The cache is not quite
 *	right...
 */

static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
{
	struct sock * sk;

	sk = (struct sock *) th_cache_sk;
	if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
	    sport != th_cache_sport || dport != th_cache_dport) {
		sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
		if (sk) {
			th_cache_saddr=saddr;
			th_cache_daddr=daddr;
			th_cache_dport=dport;
			th_cache_sport=sport;
			th_cache_sk=sk;
		}
	}
	return sk;
}

/*
 *	React to an out-of-window TCP sequence number in an incoming packet
 */

static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, u32 end_seq,
	      struct device *dev)
{
	if (th->rst)
		return;

	/*
	 *	Send a reset if we get something not ours and we are
	 *	unsynchronized. Note: We don't do anything to our end. We
	 *	are just killing the bogus remote connection then we will
	 *	connect again and it will work (with luck).
	 */

	if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
	{
		tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
		return;
	}

	/*
	 *	We got out of sequence data.
	 *	This turns out to be tricky. If the packet ends at the
	 *	edge of the window, then we MUST ack the packet,
	 *	otherwise a lost ACK packet can stall the TCP.
	 *	We deal with this case in tcp_queue().
	 *	On the other hand, if the packet is further to the
	 *	left of the window, then we are looking at a retransmitted
	 *	packet. If we ACK it we can get into a situation that
	 *	will later induce a fast retransmit of another packet.
	 *	This can end up eating up half our bandwidth.
	 */

	/* This case is NOT supposed to be able
	 * to happen. Test for it?
	 */
	if (sk->acked_seq == end_seq)
		printk("Impossible out of sequence data case.\n");
	return;
}

/*
 *	This function checks to see if the tcp header is actually acceptable.
 */

extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
{
	u32 end_window = sk->acked_seq + sk->window;
	return	/* if start is at end of window, end must be too (zero window) */
		(seq == end_window && seq == end_seq) ||
		/* if start is before end of window, check for interest */
		(before(seq, end_window) && !before(end_seq, sk->acked_seq));
}

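/*
 *	Example of the check above (illustrative only): with acked_seq = 1000
 *	and window = 500, end_window is 1500. A segment covering 1200..1300 is
 *	accepted; one ending at 900 is entirely old and is rejected; one
 *	starting at 1600 lies entirely beyond the window and is rejected.
 *	With a zero window only a segment with seq == end_seq == 1000 (a bare
 *	ACK) gets through.
 */
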
/*
 *	When we get a reset we do this. This probably is a tcp_output routine
 *	really.
 */

static int tcp_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;
	/*
	 *	We want the right error as BSD sees it (and indeed as we do).
	 */
	sk->err = ECONNRESET;
	if (sk->state == TCP_SYN_SENT)
		sk->err = ECONNREFUSED;
	if (sk->state == TCP_CLOSE_WAIT)
		sk->err = EPIPE;
#ifdef CONFIG_TCP_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 *
	 *	This is a good idea, but causes more sockets to take time to close.
	 *
	 *	Ian Heavens has since shown this is an inadequate fix for the protocol
	 *	bug in question.
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	return(0);
}

/*
 *	Look for tcp options. Parses everything but only knows about MSS.
 *	This routine is always called with the packet containing the SYN.
 *	However it may also be called with the ack to the SYN. So you
 *	can't assume this is always the SYN. It's always called after
 *	we have set up sk->mtu to our own MTU.
 *
 *	We need at minimum to add PAWS support here. Possibly large windows
 *	as Linux gets deployed on 100Mb/sec networks.
 */

static void tcp_options(struct sock *sk, struct tcphdr *th)
{
	unsigned char *ptr;
	int length=(th->doff*4)-sizeof(struct tcphdr);
	int mss_seen = 0;

	ptr = (unsigned char *)(th + 1);

	while(length>0)
	{
		int opcode=*ptr++;
		int opsize=*ptr++;
		switch(opcode)
		{
			case TCPOPT_EOL:
				return;
			case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
				length--;
				ptr--;		/* the opsize=*ptr++ above was a mistake */
				continue;

			default:
				if(opsize<=2)	/* Avoid silly options looping forever */
					return;
				switch(opcode)
				{
					case TCPOPT_MSS:
						if(opsize==4 && th->syn)
						{
							sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
							mss_seen = 1;
						}
						break;
					/* Add other options here as people feel the urge to implement stuff like large windows */
				}
				ptr+=opsize-2;
				length-=opsize;
		}
	}
	if (th->syn)
	{
		if (! mss_seen)
			sk->mtu=min(sk->mtu, 536);	/* default MSS if none sent */
	}
#ifdef CONFIG_INET_PCTCP
	sk->mss = min(sk->max_window >> 1, sk->mtu);
#else
	sk->mss = min(sk->max_window, sk->mtu);
	sk->max_unacked = 2 * sk->mss;
#endif
}

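/*
 *	For reference (illustrative only): the only option parsed above is
 *	MSS, which on the wire is kind 2, length 4, followed by a 16-bit MSS
 *	in network byte order, e.g. 02 04 05 B4 for an MSS of 1460. EOL is
 *	kind 0 and NOP is kind 1, which is why those two are handled before
 *	the length byte is trusted.
 */
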
/*
 *	This routine handles a connection request.
 *	It should make sure we haven't already responded.
 *	Because of the way BSD works, we have to send a syn/ack now.
 *	This also means it will be harder to close a socket which is
 *	listening.
 */

static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
{
	struct sock *newsk;
	struct tcphdr *th;
	struct rtable *rt;

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead)
	{
		sk->data_ready(sk,0);
	}
	else
	{
		if(sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	Make sure we can accept more.  This will prevent a
	 *	flurry of syns from eating up all our memory.
	 *
	 *	BSD does some funnies here and allows 3/2 times the
	 *	set backlog as a fudge factor. That's just too gross.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * We need to build a new sock struct.
	 * It is sort of bad to have a socket without an inode attached
	 * to it, but the wake_up's will just wake up the listening socket,
	 * and if the listening socket is destroyed before this is taken
	 * off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* just ignore the syn.  It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	memcpy(newsk, sk, sizeof(*newsk));
	newsk->opt = NULL;
	newsk->ip_route_cache = NULL;
	if (opt && opt->optlen)
	{
		newsk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
		if (!newsk->opt)
		{
			kfree_s(newsk, sizeof(struct sock));
			tcp_statistics.TcpAttemptFails++;
			kfree_skb(skb, FREE_READ);
			return;
		}
		if (ip_options_echo(newsk->opt, opt, daddr, saddr, skb))
		{
			kfree_s(newsk->opt, sizeof(struct options)+opt->optlen);
			kfree_s(newsk, sizeof(struct sock));
			tcp_statistics.TcpAttemptFails++;
			kfree_skb(skb, FREE_READ);
			return;
		}
	}
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	newsk->acked_seq = skb->seq+1;
	newsk->lastwin_seq = skb->seq+1;
	newsk->delay_acks = 1;
	newsk->copied_seq = skb->seq+1;
	newsk->fin_seq = skb->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger=0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->delack_timer);
	newsk->delack_timer.data = (unsigned long)newsk;
	newsk->delack_timer.function = tcp_delack_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function = tcp_retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/*
	 *	Swap these two, they are from our point of view.
	 */

	newsk->daddr = saddr;
	newsk->saddr = daddr;
	newsk->rcv_saddr = daddr;

	put_sock(newsk->num,newsk);
	newsk->acked_seq = skb->seq + 1;
	newsk->copied_seq = skb->seq + 1;
	newsk->socket = NULL;

	/*
	 *	Grab the ttl and tos values and use them
	 */

	newsk->ip_ttl=sk->ip_ttl;
	newsk->ip_tos=skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for
	 */

	/*
	 *	Note use of sk->user_mss, since user has no direct access to newsk
	 */

	rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
	newsk->ip_route_cache = rt;

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if (rt)
		newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	else
		newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

	/*
	 *	But not bigger than device MTU
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

#ifdef CONFIG_SKIP

	/*
	 *	SKIP devices set their MTU to 65535. This is so they can take packets
	 *	unfragmented to the security process and then fragment. They could lie to the
	 *	TCP layer about a suitable MTU, but it's easier to let skip sort it out
	 *	simply because the final package we want unfragmented is going to be
	 *
	 *	[IPHDR][IPSP][Security data][Modified TCP data][Security data]
	 */

	if(skip_pick_mtu!=NULL)		/* If SKIP is loaded.. */
		sk->mtu=skip_pick_mtu(sk->mtu,dev);
#endif
	/*
	 *	This will min with what arrived in the packet
	 */

	tcp_options(newsk,skb->h.th);

	tcp_cache_zap();
	tcp_send_synack(newsk, sk, skb);
}

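/*
 *	For orientation (summary of the surrounding code, not new behaviour):
 *	a SYN arriving on a LISTEN socket reaches this function from tcp_rcv(),
 *	the SYN/ACK goes out via tcp_send_synack() above, and the final ACK of
 *	the three way handshake is picked up by the TCP_SYN_RECV branch in
 *	tcp_ack(), which moves the new socket to TCP_ESTABLISHED.
 */
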
/*
 *	Handle a TCP window that shrunk on us. It shouldn't happen,
 *	but..
 *
 *	We may need to move packets from the send queue
 *	to the write queue, if the window has been shrunk on us.
 *	The RFC says you are not allowed to shrink your window
 *	like this, but if the other end does, you must be able
 *	to deal with it.
 */
void tcp_window_shrunk(struct sock * sk, u32 window_seq)
{
	struct sk_buff *skb;
	struct sk_buff *skb2;
	struct sk_buff *wskb = NULL;

	skb2 = sk->send_head;
	sk->send_head = NULL;
	sk->send_tail = NULL;

	/*
	 *	This is an artifact of a flawed concept. We want one
	 *	queue and a smarter send routine when we send all.
	 */
	cli();
	while (skb2 != NULL)
	{
		skb = skb2;
		skb2 = skb->link3;
		skb->link3 = NULL;
		if (after(skb->end_seq, window_seq))
		{
			if (sk->packets_out > 0)
				sk->packets_out--;
			/* We may need to remove this from the dev send list. */
			if (skb->next != NULL)
			{
				skb_unlink(skb);
			}
			/* Now add it to the write_queue. */
			if (wskb == NULL)
				skb_queue_head(&sk->write_queue,skb);
			else
				skb_append(wskb,skb);
			wskb = skb;
		}
		else
		{
			if (sk->send_head == NULL)
			{
				sk->send_head = skb;
				sk->send_tail = skb;
			}
			else
			{
				sk->send_tail->link3 = skb;
				sk->send_tail = skb;
			}
			skb->link3 = NULL;
		}
	}
	sti();
}

/*
 *	This routine deals with incoming acks, but not outgoing ones.
 *
 *	This routine is totally _WRONG_. The list structuring is wrong,
 *	the algorithm is wrong, the code is wrong.
 */

static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
{
	int flag = 0;
	u32 window_seq;

	/*
	 * 1 - there was data in packet as well as ack or new data is sent or
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 * 8 - we want to do a fast retransmit. One packet only.
	 */

	if(sk->zapped)
		return(1);	/* Dead, can't ack any more so why bother */

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */

	if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
		goto uninteresting_ack;

	/*
	 *	If there is data set flag 1
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 *	Have we discovered a larger window
	 */
	window_seq = ntohs(th->window);
	if (window_seq > sk->max_window)
	{
		sk->max_window = window_seq;
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(window_seq>>1, sk->mtu);
#else
		sk->mss = min(window_seq, sk->mtu);
#endif
	}
	window_seq += ack;

	/*
	 *	See if our window has been shrunk.
	 */
	if (after(sk->window_seq, window_seq)) {
		flag |= 4;
		tcp_window_shrunk(sk, window_seq);
	}

	/*
	 *	Pipe has emptied
	 */
	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out= 0;
	}

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{

		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a
		 * counter and increment it once every cwnd times.  It's possible
		 * that this should be done only if sk->retransmits == 0.  I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly.  In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

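	/*
	 *	Illustrative trace (not part of the original logic): with
	 *	ssthresh = 8, cong_window grows by one segment per ACK while
	 *	below ssthresh, roughly doubling every round trip; above it
	 *	cong_count has to reach cong_window before the window grows
	 *	again, i.e. about one extra segment per round trip.
	 */
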
	/*
	 *	Remember the highest ack received and update the
	 *	right hand window edge of the host.
	 *	We do a bit of work here to track number of times we've
	 *	seen this ack without a change in the right edge of the
	 *	window. This will allow us to do fast retransmits.
	 */

	if (sk->rcv_ack_seq == ack && sk->window_seq == window_seq)
	{
		/*
		 * We only want to short cut this once, many
		 * ACKs may still come, we'll do a normal transmit
		 * for these ACKs.
		 */
		if (++sk->rcv_ack_cnt == MAX_DUP_ACKS+1)
			flag |= 8;	/* flag for a fast retransmit */
	}
	else
	{
		sk->window_seq = window_seq;
		sk->rcv_ack_seq = ack;
		sk->rcv_ack_cnt = 1;
	}

	/*
	 *	We passed data and got it acked, remove any soft error
	 *	log. Something worked...
	 */

	sk->err_soft = 0;

	/*
	 *	If this ack opens up a zero window, clear backoff.  It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open ?
		 */

		if (!skb_queue_empty(&sk->write_queue) &&   /* should always be true */
		    ! before (sk->window_seq, sk->write_queue.next->end_seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt.  this eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < HZ/5)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = HZ/5;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	for (;;) {
		struct sk_buff * skb = sk->send_head;
		if (!skb)
			break;

		/* Check for a bug. */
		if (skb->link3 && after(skb->end_seq, skb->link3->end_seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived at the other end.
		 */

		if (after(skb->end_seq, ack))
			break;

		if (sk->retransmits)
		{
			/*
			 *	We were retransmitting.  don't count this in RTT est
			 */
			flag |= 2;
		}

		if ((sk->send_head = skb->link3) == NULL)
		{
			sk->send_tail = NULL;
			sk->retransmits = 0;
		}
		/*
		 * Note that we only reset backoff and rto in the
		 * rtt recomputation code.  And that doesn't happen
		 * if there were retransmissions in effect.  So the
		 * first new packet after the retransmissions is
		 * sent with the backoff still in effect.  Not until
		 * we get an ack from a non-retransmitted packet do
		 * we reset the backoff and rto.  This allows us to deal
		 * with a situation where the network delay has increased
		 * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
		 */

		/*
		 *	We have one less packet out there.
		 */

		if (sk->packets_out > 0)
			sk->packets_out --;

		if (!(flag&2)) 	/* Not retransmitting */
			tcp_rtt_estimator(sk,skb);
		flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
				   In this case as we just set it up */
		IS_SKB(skb);

		/*
		 *	We may need to remove this from the dev send list.
		 */
		cli();
		if (skb->next)
			skb_unlink(skb);
		sti();
		kfree_skb(skb, FREE_WRITE); /* write. */
		if (!sk->dead)
			sk->write_space(sk);
	}

	/*
	 * XXX someone ought to look at this too.. at the moment, if skb_peek()
	 * returns non-NULL, we completely ignore the timer stuff in the else
	 * clause.  We ought to organize the code so that else clause can
	 * (should) be executed regardless, possibly moving the PROBE timer
	 * reset over.  The skb_peek() thing should only move stuff to the
	 * write queue, NOT also manage the timer functions.
	 */

	/*
	 * Maybe we can take some stuff off of the write queue,
	 * and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (!before(sk->window_seq, sk->write_queue.next->end_seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 !after(sk->write_queue.next->end_seq, sk->rcv_ack_seq))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room.
			 */
			tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 * from TCP_CLOSE we don't do anything
		 *
		 * from anything else, if there is write data (or fin) pending,
		 * we use a TIME_WRITE timeout, else if keepalive we reset to
		 * a KEEPALIVE timeout, else we delete the timer.
		 *
		 * We do not set flag for nominal write data, otherwise we may
		 * force a state where we start to write itsy bitsy tidbits
		 * of data.
		 */

		switch(sk->state) {
		case TCP_TIME_WAIT:
			/*
			 * keep us in TIME_WAIT until we stop getting packets,
			 * reset the timeout.
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 * don't touch the timer.
			 */
			break;
		default:
			/*
			 *	Reset the xmit timer - state has changed.
			 */
			tcp_reset_xmit_timer(sk, 0, 0);
			break;
		}
	}

	/*
	 *	We have nothing queued but space to send. Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0
	    && sk->partial != NULL
	    && skb_queue_empty(&sk->write_queue)
	    && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
	 * we are now waiting for an acknowledge to our FIN.  The other end is
	 * already in TIME_WAIT.
	 *
	 * Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if(sk->debug)
			printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/ )
		{
			flag |= 1;
			sk->shutdown = SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_CLOSE);
			return 1;
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 *	Move to FIN_WAIT2 to await a FIN from the other end. Set
	 *	SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 *	Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake
	 */

	if(sk->state==TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest=th->source;
		sk->copied_seq = sk->acked_seq;
		if(!sk->dead)
			sk->state_change(sk);
		if(sk->max_window==0)
		{
			sk->max_window=32;	/* Sanity check */
			sk->mss=min(sk->max_window,sk->mtu);
		}
	}

	/*
	 * I make no guarantees about the first clause in the following
	 * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
	 * what conditions "!flag" would be true.  However I think the rest
	 * of the conditions would prevent that from causing any
	 * unnecessary retransmission.
	 *   Clearly if the first packet has expired it should be
	 * retransmitted.  The other alternative, "flag&2 && retransmits", is
	 * harder to explain:  You have to look carefully at how and when the
	 * timer is set and with what timeout.  The most recent transmission always
	 * sets the timer.  So in general if the most recent thing has timed
	 * out, everything before it has as well.  So we want to go ahead and
	 * retransmit some more.  If we didn't explicitly test for this
	 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 * would not be true.  If you look at the pattern of timing, you can
	 * show that rto is increased fast enough that the next packet would
	 * almost never be retransmitted immediately.  Then you'd end up
	 * waiting for a timeout to send each packet on the retransmission
	 * queue.  With my implementation of the Karn sampling algorithm,
	 * the timeout would double each time.  The net result is that it would
	 * take a hideous amount of time to recover from a single dropped packet.
	 * It's possible that there should also be a test for TIME_WRITE, but
	 * I think as long as "send_head != NULL" and "retransmit" is on, we've
	 * got to be in real retransmission mode.
	 *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
	 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 * As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	       (((flag&2) && sk->retransmits) ||
	       (flag&8) ||
	       (sk->send_head->when + sk->rto < jiffies)))
	{
		if(sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return 1;

uninteresting_ack:
	if(sk->debug)
		printk("Ack ignored %u %u\n",ack,sk->sent_seq);

	/*
	 *	Keepalive processing.
	 */

	if (after(ack, sk->sent_seq))
	{
		return 0;
	}

	/*
	 *	Restart the keepalive timer.
	 */

	if (sk->keepopen)
	{
		if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
			tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
	}
	return 1;
}

/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 *
 */

static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	sk->fin_seq = skb->end_seq;

	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * move to CLOSE_WAIT, tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * received a retransmission of the FIN,
			 * restart the TIME_WAIT timer.
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens, we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */

			if(sk->ip_xmit_timeout != TIME_WRITE)
				tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * received a FIN -- send ACK and enter TIME_WAIT
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * already in CLOSE
			 */
			break;
		default:
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}

/*
 *	Add a sk_buff to the TCP receive queue, calculating
 *	the ACK sequence as we go..
 */
static inline void tcp_insert_skb(struct sk_buff * skb, struct sk_buff_head * list)
{
	struct sk_buff * prev, * next;
	u32 seq;

	/*
	 * Find where the new skb goes.. (This goes backwards,
	 * on the assumption that we get the packets in order)
	 */
	seq = skb->seq;
	prev = list->prev;
	next = (struct sk_buff *) list;
	for (;;) {
		if (prev == (struct sk_buff *) list || !after(prev->seq, seq))
			break;
		next = prev;
		prev = prev->prev;
	}
	__skb_insert(skb, prev, next, list);
}

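/*
 *	Example (illustrative only): if the receive queue already holds
 *	segments starting at 100, 200 and 300 and a retransmitted segment
 *	with seq 150 arrives, the backwards scan stops at the skb with seq
 *	100 (the first one not after 150) and the new skb is linked in
 *	between 100 and 200, keeping the list ordered by sequence number.
 */
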
/*
 *	Called for each packet when we find a new ACK endpoint sequence in it
 */
static inline u32 tcp_queue_ack(struct sk_buff * skb, struct sock * sk)
{
	/*
	 *	When we ack the fin, we do the FIN
	 *	processing.
	 */
	skb->acked = 1;
	if (skb->h.th->fin)
		tcp_fin(skb,sk,skb->h.th);
	return skb->end_seq;
}

static void tcp_queue(struct sk_buff * skb, struct sock * sk, struct tcphdr *th)
{
	u32 ack_seq;

	tcp_insert_skb(skb, &sk->receive_queue);

	/*
	 * Did we get anything new to ack?
	 */
	ack_seq = sk->acked_seq;


	if (!after(skb->seq, ack_seq)) {
		if (after(skb->end_seq, ack_seq)) {
			/* the packet straddles our window end */
			struct sk_buff_head * list = &sk->receive_queue;
			struct sk_buff * next;
			ack_seq = tcp_queue_ack(skb, sk);

			/*
			 * Do we have any old packets to ack that the above
			 * made visible? (Go forward from skb)
			 */
			next = skb->next;
			while (next != (struct sk_buff *) list) {
				if (after(next->seq, ack_seq))
					break;
				if (after(next->end_seq, ack_seq))
					ack_seq = tcp_queue_ack(next, sk);
				next = next->next;
			}

			/*
			 * Ok, we found new data, update acked_seq as
			 * necessary (and possibly send the actual
			 * ACK packet).
			 */
			sk->acked_seq = ack_seq;

		} else {
			if (sk->debug)
				printk("Ack duplicate packet.\n");
			tcp_send_ack(sk);
			return;
		}


		/*
		 * Delay the ack if possible.  Send ack's to
		 * fin frames immediately as there shouldn't be
		 * anything more to come.
		 */
		if (!sk->delay_acks || th->fin) {
			tcp_send_ack(sk);
		} else {
			/*
			 * If psh is set we assume it's an
			 * interactive session that wants quick
			 * acks to avoid nagling too much.
			 */
			int delay = HZ/2;
			if (th->psh)
				delay = HZ/10;
			tcp_send_delayed_ack(sk, delay);
		}

		/*
		 *	Tell the user we have some more data.
		 */

		if (!sk->dead)
			sk->data_ready(sk,0);

	}
	else
	{
		/*
		 *	If we've missed a packet, send an ack.
		 *	Also start a timer to send another.
		 *
		 *	4.3reno machines look for these kind of acks so
		 *	they can do fast recovery. Three identical 'old'
		 *	acks lets it know that one frame has been lost
		 *	and should be resent. Because this is before the
		 *	whole window of data has timed out it can take
		 *	one lost frame per window without stalling.
		 *	[See Jacobson RFC1323, Stevens TCP/IP illus vol2]
		 *
		 *	We also should be spotting triple bad sequences.
		 *	[We now do this.]
		 *
		 */

		if (!skb->acked)
		{
			if(sk->debug)
				printk("Ack past end of seq packet.\n");
			tcp_send_ack(sk);
			sk->ack_backlog++;
			tcp_reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, HZ/2));
		}
	}
}

/*
 *	This routine handles the data.  If there is room in the buffer,
 *	it will have already been moved into it.  If there is no
 *	room, then we will just have to discard the packet.
 */

static int tcp_data(struct sk_buff *skb, struct sock *sk,
	 unsigned long saddr, unsigned short len)
{
	struct tcphdr *th;
	u32 new_seq, shut_seq;

	th = skb->h.th;
	skb_pull(skb,th->doff*4);
	skb_trim(skb,len-(th->doff*4));

	/*
	 *	The bytes in the receive read/assembly queue have increased. Needed for the
	 *	low memory discard algorithm
	 */

	sk->bytes_rcv += skb->len;

	if (skb->len == 0 && !th->fin)
	{
		/*
		 *	Don't want to keep passing ack's back and forth.
		 *	(someone sent us dataless, boring frame)
		 */
		if (!th->ack)
			tcp_send_ack(sk);
		kfree_skb(skb, FREE_READ);
		return(0);
	}

	/*
	 *	We no longer have anyone receiving data on this connection.
	 */

#ifndef TCP_DONT_RST_SHUTDOWN

	if(sk->shutdown & RCV_SHUTDOWN)
	{
		/*
		 *	FIXME: BSD has some magic to avoid sending resets to
		 *	broken 4.2 BSD keepalives. Much to my surprise a few non
		 *	BSD stacks still have broken keepalives so we want to
		 *	cope with it.
		 */

		if(skb->len)	/* We don't care if it's just an ack or
				   a keepalive/window probe */
		{
			new_seq = skb->seq + skb->len + th->syn;	/* Right edge of _data_ part of frame */

			/* Do this the way 4.4BSD treats it. Not what I'd
			   regard as the meaning of the spec but it's what BSD
			   does and clearly they know everything 8) */

			/*
			 *	This is valid because of two things
			 *
			 *	a) The way tcp_data behaves at the bottom.
			 *	b) A fin takes effect when read not when received.
			 */

			shut_seq = sk->acked_seq+1;	/* Last byte */

			if(after(new_seq,shut_seq))
			{
				if(sk->debug)
					printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
						sk, new_seq, shut_seq, sk->blog);
				if(sk->dead)
				{
					sk->acked_seq = new_seq + th->fin;
					tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
						sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
					tcp_statistics.TcpEstabResets++;
					sk->err = EPIPE;
					sk->error_report(sk);
					sk->shutdown = SHUTDOWN_MASK;
					tcp_set_state(sk,TCP_CLOSE);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
			}
		}
	}

#endif

	tcp_queue(skb, sk, th);

	return(0);
}

/*
 *	This routine is only called when we have urgent data
 *	signalled. It's the 'slow' part of tcp_urg. It could be
 *	moved inline now as tcp_urg is only called from one
 *	place. We handle URGent data wrong. We have to - as
 *	BSD still doesn't use the correction from RFC961.
 */

static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
{
	u32 ptr = ntohs(th->urg_ptr);

	if (ptr)
		ptr--;
	ptr += ntohl(th->seq);

	/* ignore urgent data that we've already seen and read */
	if (after(sk->copied_seq, ptr))
		return;

	/* do we already have a newer (or duplicate) urgent pointer? */
	if (sk->urg_data && !after(ptr, sk->urg_seq))
		return;

	/* tell the world about our new urgent pointer */
	if (sk->proc != 0) {
		if (sk->proc > 0) {
			kill_proc(sk->proc, SIGURG, 1);
		} else {
			kill_pg(-sk->proc, SIGURG, 1);
		}
	}
	sk->urg_data = URG_NOTYET;
	sk->urg_seq = ptr;
}

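/*
 *	Example of what the code above does (illustrative only): for a
 *	segment with seq 1000 and urg_ptr 4, ptr becomes 1000 + (4 - 1) =
 *	1003 and that byte is treated as the urgent byte. sk->urg_seq
 *	remembers 1003 until tcp_urg() below finds a segment covering that
 *	sequence and pulls the byte out of it.
 */
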
/*
 *	This is the 'fast' part of urgent handling.
 */

static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
{
	/*
	 *	Check if we get a new urgent pointer - normally not
	 */

	if (th->urg)
		tcp_check_urg(sk,th);

	/*
	 *	Do we wait for any urgent data? - normally not
	 */

	if (sk->urg_data == URG_NOTYET) {
		u32 ptr;

		/*
		 *	Is the urgent pointer pointing into this packet?
		 */
		ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
		if (ptr < len) {
			sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
			if (!sk->dead)
				sk->data_ready(sk,0);
		}
	}
}

/*
 * This should be a bit smarter and remove partially
 * overlapping stuff too, but this should be good
 * enough for any even remotely normal case (and the
 * worst that can happen is that we have a few
 * unnecessary packets in the receive queue).
 *
 * This function is never called with an empty list..
 */
static inline void tcp_remove_dups(struct sk_buff_head * list)
{
	struct sk_buff * next = list->next;

	for (;;) {
		struct sk_buff * skb = next;
		next = next->next;
		if (next == (struct sk_buff *) list)
			break;
		if (before(next->end_seq, skb->end_seq)) {
			__skb_unlink(next, list);
			kfree_skb(next, FREE_READ);
			next = skb;
			continue;
		}
		if (next->seq != skb->seq)
			continue;
		__skb_unlink(skb, list);
		kfree_skb(skb, FREE_READ);
	}
}

/*
 * Throw out all unnecessary packets: we've gone over the
 * receive queue limit. This shouldn't happen in a normal
 * TCP connection, but we might have gotten duplicates etc.
 */
static void prune_queue(struct sk_buff_head * list)
{
	for (;;) {
		struct sk_buff * skb = list->prev;

		/* gone through it all? */
		if (skb == (struct sk_buff *) list)
			break;
		if (!skb->acked) {
			__skb_unlink(skb, list);
			kfree_skb(skb, FREE_READ);
			continue;
		}
		tcp_remove_dups(list);
		break;
	}
}

/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	/*
	 * "redo" is 1 if we have already seen this skb but couldn't
	 * use it at that time (the socket was locked).  In that case
	 * we have already done a lot of the work (looked up the socket
	 * etc).
	 */
	th = skb->h.th;
	sk = skb->sk;
	if (!redo) {
		tcp_statistics.TcpInSegs++;
		if (skb->pkt_type!=PACKET_HOST)
			goto discard_it;

		/*
		 *	Pull up the IP header.
		 */

		skb_pull(skb, skb->h.raw-skb->data);

		/*
		 *	Try to use the device checksum if provided.
		 */
		switch (skb->ip_summed)
		{
			case CHECKSUM_NONE:
				skb->csum = csum_partial((char *)th, len, 0);
			case CHECKSUM_HW:
				if (tcp_check(th, len, saddr, daddr, skb->csum))
					goto discard_it;
			default:
				/* CHECKSUM_UNNECESSARY */
		}
		sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
		if (!sk)
			goto no_tcp_socket;
		skb->sk = sk;
		skb->seq = ntohl(th->seq);
		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
		skb->ack_seq = ntohl(th->ack_seq);

		skb->acked = 0;
		skb->used = 0;
		skb->free = 1;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/*
		 *	We may need to add it to the backlog here.
		 */
		if (sk->users)
		{
			__skb_queue_tail(&sk->back_log, skb);
			return(0);
		}
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk->zapped || sk->state==TCP_CLOSE)
		goto no_tcp_socket;

	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 */

	skb->sk=sk;
	atomic_add(skb->truesize, &sk->rmem_alloc);

	/*
	 *	We should now do header prediction.
	 */

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Suns have
			 *	this problem so I'm ignoring it
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 *
			 *	Now that TTCP is starting to be used, we ought to queue this data.
			 */

			return 0;
		}

		/*
		 *	Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
		 *	then it's a new connection
		 */

		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected. The SYN_SENT case is unusual and should
		 *	not be in line code. [AC]
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,skb->ack_seq,len))
				{
					/* Reset the ack - it's an ack from a
					   different connection  [ th->rst is checked in tcp_send_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return(0);
				}
				if(th->rst)
					return tcp_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq = skb->seq+1;
				sk->lastwin_seq = skb->seq+1;
				sk->fin_seq = skb->seq;
				tcp_send_ack(sk);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if(sk->max_window==0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

		/*
		 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
		 *	a more complex suggestion for fixing these reuse issues in RFC1644
		 *	but not yet ready for general use. Also see RFC1379.
		 *
		 *	Note the funny way we go back to the top of this function for
		 *	this case ("goto try_next_socket"). That also takes care of
		 *	checking "sk->users" for the new socket as well as doing all
		 *	the normal tests on the packet.
		 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(skb->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			atomic_sub(skb->truesize, &sk->rmem_alloc);
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			/* this is not really correct: we should check sk->users */
			if (sk && sk->state==TCP_LISTEN)
			{
				skb->sk = sk;
				atomic_add(skb->truesize, &sk->rmem_alloc);
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */

	if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
	{
		bad_tcp_sequence(sk, th, skb->end_seq-th->syn, dev);
		kfree_skb(skb, FREE_READ);
		return 0;
	}

	if(th->rst)
		return tcp_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_reset(sk,skb);
	}

	tcp_delack_estimator(sk);

	/*
	 *	Process the ACK
	 */

	if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	If the accepted buffer put us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */

	/*
	 *	Process urgent data
	 */

	tcp_urg(sk, th, len);

	/*
	 *	Process the encapsulated data
	 */

	if(tcp_data(skb,sk, saddr, len))
		kfree_skb(skb, FREE_READ);

	/*
	 *	If our receive queue has grown past its limits,
	 *	try to prune away duplicates etc..
	 */
	if (sk->rmem_alloc > sk->rcvbuf)
		prune_queue(&sk->receive_queue);

	/*
	 *	And done
	 */

	return 0;

no_tcp_socket:
	/*
	 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
	 */
	tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);

discard_it:
	/*
	 *	Discard frame
	 */
	skb->sk = NULL;
	kfree_skb(skb, FREE_READ);
	return 0;
}