1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp_input.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * FIXES 23 * Pedro Roque : Double ACK bug 24 */ 25
26 #include <linux/config.h>
27 #include <net/tcp.h>
28
/*
 *	Policy code extracted so it's now separate
 */

33 /* 34 * Called each time to estimate the delayed ack timeout. This is 35 * how it should be done so a fast link isn't impacted by ack delay. 36 */ 37
38 extern__inline__voidtcp_delack_estimator(structsock *sk)
/* */ 39 { 40 /* 41 * Delayed ACK time estimator. 42 */ 43
44 if (sk->lrcvtime == 0)
45 { 46 sk->lrcvtime = jiffies;
47 sk->ato = HZ/3;
48 } 49 else 50 { 51 intm;
52
53 m = jiffies - sk->lrcvtime;
54
55 sk->lrcvtime = jiffies;
56
57 if (m <= 0)
58 m = 1;
59
60 if (m > (sk->rtt >> 3))
61 { 62 sk->ato = sk->rtt >> 3;
63 /* 64 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato); 65 */ 66 } 67 else 68 { 69 sk->ato = (sk->ato >> 1) + m;
70 /* 71 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato); 72 */ 73 } 74 } 75 } 76
77 /* 78 * Called on frames that were known _not_ to have been 79 * retransmitted [see Karn/Partridge Proceedings SIGCOMM 87]. 80 * The algorithm is from the SIGCOMM 88 piece by Van Jacobson. 81 */ 82
83 extern__inline__voidtcp_rtt_estimator(structsock *sk, structsk_buff *oskb)
/* */ 84 { 85 longm;
86 /* 87 * The following amusing code comes from Jacobson's 88 * article in SIGCOMM '88. Note that rtt and mdev 89 * are scaled versions of rtt and mean deviation. 90 * This is designed to be as fast as possible 91 * m stands for "measurement". 92 */ 93
94 m = jiffies - oskb->when; /* RTT */ 95 if(m<=0)
96 m=1; /* IS THIS RIGHT FOR <0 ??? */ 97 m -= (sk->rtt >> 3); /* m is now error in rtt est */ 98 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */ 99 if (m < 0)
100 m = -m; /* m is now abs(error) */ 101 m -= (sk->mdev >> 2); /* similar update on mdev */ 102 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ 103
104 /* 105 * Now update timeout. Note that this removes any backoff. 106 */ 107
108 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
109 if (sk->rto > 120*HZ)
110 sk->rto = 120*HZ;
111 if (sk->rto < HZ/5) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */ 112 sk->rto = HZ/5;
113 sk->backoff = 0;
114 } 115
/*
 *	Single entry cache of the last socket that was looked up: the
 *	address and port pairs used as the key, and the socket they hit.
 */

static volatile unsigned long th_cache_saddr;
static volatile unsigned long th_cache_daddr;
static volatile unsigned short th_cache_dport;
static volatile unsigned short th_cache_sport;
static volatile struct sock *th_cache_sk;
123
124 voidtcp_cache_zap(void)
/* */ 125 { 126 th_cache_sk=NULL;
127 } 128
129 /* 130 * Find the socket, using the last hit cache if applicable. The cache is not quite 131 * right... 132 */ 133
134 staticinlinestructsock * get_tcp_sock(u32saddr, u16sport, u32daddr, u16dport)
/* */ 135 { 136 structsock * sk;
137
138 sk = (structsock *) th_cache_sk;
139 if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
140 sport != th_cache_sport || dport != th_cache_dport) { 141 sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
142 if (sk) { 143 th_cache_saddr=saddr;
144 th_cache_daddr=daddr;
145 th_cache_dport=dport;
146 th_cache_sport=sport;
147 th_cache_sk=sk;
148 } 149 } 150 returnsk;
151 } 152
153 /* 154 * React to a out-of-window TCP sequence number in an incoming packet 155 */ 156
157 staticvoidbad_tcp_sequence(structsock *sk, structtcphdr *th, u32end_seq,
/* */ 158 structdevice *dev)
159 { 160 if (th->rst)
161 return;
162
163 /* 164 * Send a reset if we get something not ours and we are 165 * unsynchronized. Note: We don't do anything to our end. We 166 * are just killing the bogus remote connection then we will 167 * connect again and it will work (with luck). 168 */ 169
170 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
171 { 172 tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
173 return;
174 } 175
176 /* 177 * 4.3reno machines look for these kind of acks so they can do fast 178 * recovery. Three identical 'old' acks lets it know that one frame has 179 * been lost and should be resent. Because this is before the whole window 180 * of data has timed out it can take one lost frame per window without 181 * stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2] 182 */ 183 tcp_send_ack(sk);
184 } 185
186 /* 187 * This functions checks to see if the tcp header is actually acceptable. 188 */ 189
190 extern__inline__inttcp_sequence(structsock *sk, u32seq, u32end_seq)
/* */ 191 { 192 u32end_window = sk->acked_seq + sk->window;
193 return/* if start is at end of window, end must be too (zero window) */ 194 (seq == end_window && seq == end_seq) ||
195 /* if start is before end of window, check for interest */ 196 (before(seq, end_window) && !before(end_seq, sk->acked_seq));
197 } 198
/*
 *	When we get a reset we do this. This probably is a tcp_output
 *	routine really.
 *
 *	Marks the socket zapped, records the error for the user, moves
 *	the socket to TCP_CLOSE with both directions shut down, signals
 *	the state change if the socket is not dead and frees skb.
 *
 *	Always returns 0.
 */

static int tcp_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/*
	 *	We want the right error as BSD sees it (and indeed as we do).
	 */
	switch (sk->state) {
	case TCP_SYN_SENT:
		sk->err = ECONNREFUSED;
		break;
	case TCP_CLOSE_WAIT:
		sk->err = EPIPE;
		break;
	default:
		sk->err = ECONNRESET;
		break;
	}

#ifdef CONFIG_TCP_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 *
	 *	This is a good idea, but causes more sockets to take time
	 *	to close.
	 *
	 *	Ian Heavens has since shown this is an inadequate fix for
	 *	the protocol bug in question.
	 */
	if (sk->state != TCP_TIME_WAIT) {
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif

	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	return 0;
}

239
240 /* 241 * Look for tcp options. Parses everything but only knows about MSS. 242 * This routine is always called with the packet containing the SYN. 243 * However it may also be called with the ack to the SYN. So you 244 * can't assume this is always the SYN. It's always called after 245 * we have set up sk->mtu to our own MTU. 246 * 247 * We need at minimum to add PAWS support here. Possibly large windows 248 * as Linux gets deployed on 100Mb/sec networks. 249 */ 250
251 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */ 252 { 253 unsignedchar *ptr;
254 intlength=(th->doff*4)-sizeof(structtcphdr);
255 intmss_seen = 0;
256
257 ptr = (unsignedchar *)(th + 1);
258
259 while(length>0)
260 { 261 intopcode=*ptr++;
262 intopsize=*ptr++;
263 switch(opcode)
264 { 265 caseTCPOPT_EOL:
266 return;
267 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ 268 length--;
269 ptr--; /* the opsize=*ptr++ above was a mistake */ 270 continue;
271
272 default:
273 if(opsize<=2) /* Avoid silly options looping forever */ 274 return;
275 switch(opcode)
276 { 277 caseTCPOPT_MSS:
278 if(opsize==4 && th->syn)
279 { 280 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
281 mss_seen = 1;
282 } 283 break;
284 /* Add other options here as people feel the urge to implement stuff like large windows */ 285 } 286 ptr+=opsize-2;
287 length-=opsize;
288 } 289 } 290 if (th->syn)
291 { 292 if (! mss_seen)
293 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */ 294 } 295 #ifdefCONFIG_INET_PCTCP 296 sk->mss = min(sk->max_window >> 1, sk->mtu);
297 #else 298 sk->mss = min(sk->max_window, sk->mtu);
299 sk->max_unacked = 2 * sk->mss;
300 #endif 301 } 302
303
304 /* 305 * This routine handles a connection request. 306 * It should make sure we haven't already responded. 307 * Because of the way BSD works, we have to send a syn/ack now. 308 * This also means it will be harder to close a socket which is 309 * listening. 310 */ 311
312 staticvoidtcp_conn_request(structsock *sk, structsk_buff *skb,
/* */ 313 u32daddr, u32saddr, structoptions *opt, structdevice *dev, u32seq)
314 { 315 structsock *newsk;
316 structtcphdr *th;
317 structrtable *rt;
318
319 th = skb->h.th;
320
321 /* If the socket is dead, don't accept the connection. */ 322 if (!sk->dead)
323 { 324 sk->data_ready(sk,0);
325 } 326 else 327 { 328 if(sk->debug)
329 printk("Reset on %p: Connect on dead socket.\n",sk);
330 tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
331 tcp_statistics.TcpAttemptFails++;
332 kfree_skb(skb, FREE_READ);
333 return;
334 } 335
336 /* 337 * Make sure we can accept more. This will prevent a 338 * flurry of syns from eating up all our memory. 339 * 340 * BSD does some funnies here and allows 3/2 times the 341 * set backlog as a fudge factor. Thats just too gross. 342 */ 343
344 if (sk->ack_backlog >= sk->max_ack_backlog)
345 { 346 tcp_statistics.TcpAttemptFails++;
347 kfree_skb(skb, FREE_READ);
348 return;
349 } 350
351 /* 352 * We need to build a new sock struct. 353 * It is sort of bad to have a socket without an inode attached 354 * to it, but the wake_up's will just wake up the listening socket, 355 * and if the listening socket is destroyed before this is taken 356 * off of the queue, this will take care of it. 357 */ 358
359 newsk = (structsock *) kmalloc(sizeof(structsock), GFP_ATOMIC);
360 if (newsk == NULL)
361 { 362 /* just ignore the syn. It will get retransmitted. */ 363 tcp_statistics.TcpAttemptFails++;
364 kfree_skb(skb, FREE_READ);
365 return;
366 } 367
368 memcpy(newsk, sk, sizeof(*newsk));
369 newsk->opt = NULL;
370 newsk->ip_route_cache = NULL;
371 if (opt && opt->optlen)
372 { 373 sk->opt = (structoptions*)kmalloc(sizeof(structoptions)+opt->optlen, GFP_ATOMIC);
374 if (!sk->opt)
375 { 376 kfree_s(newsk, sizeof(structsock));
377 tcp_statistics.TcpAttemptFails++;
378 kfree_skb(skb, FREE_READ);
379 return;
380 } 381 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb))
382 { 383 kfree_s(sk->opt, sizeof(structoptions)+opt->optlen);
384 kfree_s(newsk, sizeof(structsock));
385 tcp_statistics.TcpAttemptFails++;
386 kfree_skb(skb, FREE_READ);
387 return;
388 } 389 } 390 skb_queue_head_init(&newsk->write_queue);
391 skb_queue_head_init(&newsk->receive_queue);
392 newsk->send_head = NULL;
393 newsk->send_tail = NULL;
394 skb_queue_head_init(&newsk->back_log);
395 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/ 396 newsk->rto = TCP_TIMEOUT_INIT;
397 newsk->mdev = 0;
398 newsk->max_window = 0;
399 newsk->cong_window = 1;
400 newsk->cong_count = 0;
401 newsk->ssthresh = 0;
402 newsk->backoff = 0;
403 newsk->blog = 0;
404 newsk->intr = 0;
405 newsk->proc = 0;
406 newsk->done = 0;
407 newsk->partial = NULL;
408 newsk->pair = NULL;
409 newsk->wmem_alloc = 0;
410 newsk->rmem_alloc = 0;
411 newsk->localroute = sk->localroute;
412
413 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
414
415 newsk->err = 0;
416 newsk->shutdown = 0;
417 newsk->ack_backlog = 0;
418 newsk->acked_seq = skb->seq+1;
419 newsk->lastwin_seq = skb->seq+1;
420 newsk->delay_acks = 1;
421 newsk->copied_seq = skb->seq+1;
422 newsk->fin_seq = skb->seq;
423 newsk->state = TCP_SYN_RECV;
424 newsk->timeout = 0;
425 newsk->ip_xmit_timeout = 0;
426 newsk->write_seq = seq;
427 newsk->window_seq = newsk->write_seq;
428 newsk->rcv_ack_seq = newsk->write_seq;
429 newsk->urg_data = 0;
430 newsk->retransmits = 0;
431 newsk->linger=0;
432 newsk->destroy = 0;
433 init_timer(&newsk->timer);
434 newsk->timer.data = (unsignedlong)newsk;
435 newsk->timer.function = &net_timer;
436 init_timer(&newsk->delack_timer);
437 newsk->delack_timer.data = (unsignedlong)newsk;
438 newsk->delack_timer.function = tcp_delack_timer;
439 init_timer(&newsk->retransmit_timer);
440 newsk->retransmit_timer.data = (unsignedlong)newsk;
441 newsk->retransmit_timer.function = tcp_retransmit_timer;
442 newsk->dummy_th.source = skb->h.th->dest;
443 newsk->dummy_th.dest = skb->h.th->source;
444
445 /* 446 * Swap these two, they are from our point of view. 447 */ 448
449 newsk->daddr = saddr;
450 newsk->saddr = daddr;
451 newsk->rcv_saddr = daddr;
452
453 put_sock(newsk->num,newsk);
454 newsk->acked_seq = skb->seq + 1;
455 newsk->copied_seq = skb->seq + 1;
456 newsk->socket = NULL;
457
458 /* 459 * Grab the ttl and tos values and use them 460 */ 461
462 newsk->ip_ttl=sk->ip_ttl;
463 newsk->ip_tos=skb->ip_hdr->tos;
464
465 /* 466 * Use 512 or whatever user asked for 467 */ 468
469 /* 470 * Note use of sk->user_mss, since user has no direct access to newsk 471 */ 472
473 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
474 newsk->ip_route_cache = rt;
475
476 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
477 newsk->window_clamp = rt->rt_window;
478 else 479 newsk->window_clamp = 0;
480
481 if (sk->user_mss)
482 newsk->mtu = sk->user_mss;
483 elseif (rt)
484 newsk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
485 else 486 newsk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
487
488 /* 489 * But not bigger than device MTU 490 */ 491
492 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
493
494 #ifdefCONFIG_SKIP 495
496 /* 497 * SKIP devices set their MTU to 65535. This is so they can take packets 498 * unfragmented to security process then fragment. They could lie to the 499 * TCP layer about a suitable MTU, but its easier to let skip sort it out 500 * simply because the final package we want unfragmented is going to be 501 * 502 * [IPHDR][IPSP][Security data][Modified TCP data][Security data] 503 */ 504
505 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */ 506 sk->mtu=skip_pick_mtu(sk->mtu,dev);
507 #endif 508 /* 509 * This will min with what arrived in the packet 510 */ 511
512 tcp_options(newsk,skb->h.th);
513
514 tcp_cache_zap();
515 tcp_send_synack(newsk, sk, skb);
516 } 517
518
519 /* 520 * Handle a TCP window that shrunk on us. It shouldn't happen, 521 * but.. 522 * 523 * We may need to move packets from the send queue 524 * to the write queue, if the window has been shrunk on us. 525 * The RFC says you are not allowed to shrink your window 526 * like this, but if the other end does, you must be able 527 * to deal with it. 528 */ 529 voidtcp_window_shrunk(structsock * sk, u32window_seq)
/* */ 530 { 531 structsk_buff *skb;
532 structsk_buff *skb2;
533 structsk_buff *wskb = NULL;
534
535 skb2 = sk->send_head;
536 sk->send_head = NULL;
537 sk->send_tail = NULL;
538
539 /* 540 * This is an artifact of a flawed concept. We want one 541 * queue and a smarter send routine when we send all. 542 */ 543 cli();
544 while (skb2 != NULL)
545 { 546 skb = skb2;
547 skb2 = skb->link3;
548 skb->link3 = NULL;
549 if (after(skb->end_seq, window_seq))
550 { 551 if (sk->packets_out > 0)
552 sk->packets_out--;
553 /* We may need to remove this from the dev send list. */ 554 if (skb->next != NULL)
555 { 556 skb_unlink(skb);
557 } 558 /* Now add it to the write_queue. */ 559 if (wskb == NULL)
560 skb_queue_head(&sk->write_queue,skb);
561 else 562 skb_append(wskb,skb);
563 wskb = skb;
564 } 565 else 566 { 567 if (sk->send_head == NULL)
568 { 569 sk->send_head = skb;
570 sk->send_tail = skb;
571 } 572 else 573 { 574 sk->send_tail->link3 = skb;
575 sk->send_tail = skb;
576 } 577 skb->link3 = NULL;
578 } 579 } 580 sti();
581 } 582
583
584 /* 585 * This routine deals with incoming acks, but not outgoing ones. 586 * 587 * This routine is totally _WRONG_. The list structuring is wrong, 588 * the algorithm is wrong, the code is wrong. 589 */ 590
591 staticinttcp_ack(structsock *sk, structtcphdr *th, u32ack, intlen)
/* */ 592 { 593 intflag = 0;
594 u32window_seq;
595
596 /* 597 * 1 - there was data in packet as well as ack or new data is sent or 598 * in shutdown state 599 * 2 - data from retransmit queue was acked and removed 600 * 4 - window shrunk or data from retransmit queue was acked and removed 601 * 8 - we want to do a fast retransmit. One packet only. 602 */ 603
604 if(sk->zapped)
605 return(1); /* Dead, cant ack any more so why bother */ 606
607 /* 608 * We have dropped back to keepalive timeouts. Thus we have 609 * no retransmits pending. 610 */ 611
612 if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
613 sk->retransmits = 0;
614
615 /* 616 * If the ack is newer than sent or older than previous acks 617 * then we can probably ignore it. 618 */ 619
620 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
621 gotouninteresting_ack;
622
623 /* 624 * If there is data set flag 1 625 */ 626
627 if (len != th->doff*4)
628 flag |= 1;
629
630 /* 631 * Have we discovered a larger window 632 */ 633 window_seq = ntohs(th->window);
634 if (window_seq > sk->max_window)
635 { 636 sk->max_window = window_seq;
637 #ifdefCONFIG_INET_PCTCP 638 /* Hack because we don't send partial packets to non SWS 639 handling hosts */ 640 sk->mss = min(window_seq>>1, sk->mtu);
641 #else 642 sk->mss = min(window_seq, sk->mtu);
643 #endif 644 } 645 window_seq += ack;
646
647 /* 648 * See if our window has been shrunk. 649 */ 650 if (after(sk->window_seq, window_seq)) { 651 flag |= 4;
652 tcp_window_shrunk(sk, window_seq);
653 } 654
655 /* 656 * Pipe has emptied 657 */ 658 if (sk->send_tail == NULL || sk->send_head == NULL)
659 { 660 sk->send_head = NULL;
661 sk->send_tail = NULL;
662 sk->packets_out= 0;
663 } 664
665 /* 666 * We don't want too many packets out there. 667 */ 668
669 if (sk->ip_xmit_timeout == TIME_WRITE &&
670 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
671 { 672
673 /* 674 * This is Jacobson's slow start and congestion avoidance. 675 * SIGCOMM '88, p. 328. Because we keep cong_window in integral 676 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a 677 * counter and increment it once every cwnd times. It's possible 678 * that this should be done only if sk->retransmits == 0. I'm 679 * interpreting "new data is acked" as including data that has 680 * been retransmitted but is just now being acked. 681 */ 682 if (sk->cong_window < sk->ssthresh)
683 /* 684 * In "safe" area, increase 685 */ 686 sk->cong_window++;
687 else 688 { 689 /* 690 * In dangerous area, increase slowly. In theory this is 691 * sk->cong_window += 1 / sk->cong_window 692 */ 693 if (sk->cong_count >= sk->cong_window)
694 { 695 sk->cong_window++;
696 sk->cong_count = 0;
697 } 698 else 699 sk->cong_count++;
700 } 701 } 702
703 /* 704 * Remember the highest ack received and update the 705 * right hand window edge of the host. 706 * We do a bit of work here to track number of times we've 707 * seen this ack without a change in the right edge of the 708 * window. This will allow us to do fast retransmits. 709 */ 710
711 if (sk->rcv_ack_seq == ack && sk->window_seq == window_seq)
712 { 713 /* 714 * We only want to short cut this once, many 715 * ACKs may still come, we'll do a normal transmit 716 * for these ACKs. 717 */ 718 if (++sk->rcv_ack_cnt == MAX_DUP_ACKS+1)
719 flag |= 8; /* flag for a fast retransmit */ 720 } 721 else 722 { 723 sk->window_seq = window_seq;
724 sk->rcv_ack_seq = ack;
725 sk->rcv_ack_cnt = 1;
726 } 727
728 /* 729 * We passed data and got it acked, remove any soft error 730 * log. Something worked... 731 */ 732
733 sk->err_soft = 0;
734
735 /* 736 * If this ack opens up a zero window, clear backoff. It was 737 * being used to time the probes, and is probably far higher than 738 * it needs to be for normal retransmission. 739 */ 740
741 if (sk->ip_xmit_timeout == TIME_PROBE0)
742 { 743 sk->retransmits = 0; /* Our probe was answered */ 744
745 /* 746 * Was it a usable window open ? 747 */ 748
749 if (!skb_queue_empty(&sk->write_queue) && /* should always be true */ 750 ! before (sk->window_seq, sk->write_queue.next->end_seq))
751 { 752 sk->backoff = 0;
753
754 /* 755 * Recompute rto from rtt. this eliminates any backoff. 756 */ 757
758 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
759 if (sk->rto > 120*HZ)
760 sk->rto = 120*HZ;
761 if (sk->rto < HZ/5) /* Was 1*HZ, then 1 - turns out we must allow about 762 .2 of a second because of BSD delayed acks - on a 100Mb/sec link 763 .2 of a second is going to need huge windows (SIGH) */ 764 sk->rto = HZ/5;
765 } 766 } 767
768 /* 769 * See if we can take anything off of the retransmit queue. 770 */ 771
772 for (;;) { 773 structsk_buff * skb = sk->send_head;
774 if (!skb)
775 break;
776
777 /* Check for a bug. */ 778 if (skb->link3 && after(skb->end_seq, skb->link3->end_seq))
779 printk("INET: tcp.c: *** bug send_list out of order.\n");
780
781 /* 782 * If our packet is before the ack sequence we can 783 * discard it as it's confirmed to have arrived the other end. 784 */ 785
786 if (after(skb->end_seq, ack))
787 break;
788
789 if (sk->retransmits)
790 { 791 /* 792 * We were retransmitting. don't count this in RTT est 793 */ 794 flag |= 2;
795 } 796
797 if ((sk->send_head = skb->link3) == NULL)
798 { 799 sk->send_tail = NULL;
800 sk->retransmits = 0;
801 } 802 /* 803 * Note that we only reset backoff and rto in the 804 * rtt recomputation code. And that doesn't happen 805 * if there were retransmissions in effect. So the 806 * first new packet after the retransmissions is 807 * sent with the backoff still in effect. Not until 808 * we get an ack from a non-retransmitted packet do 809 * we reset the backoff and rto. This allows us to deal 810 * with a situation where the network delay has increased 811 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.) 812 */ 813
814 /* 815 * We have one less packet out there. 816 */ 817
818 if (sk->packets_out > 0)
819 sk->packets_out --;
820
821 if (!(flag&2)) /* Not retransmitting */ 822 tcp_rtt_estimator(sk,skb);
823 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt 824 In this case as we just set it up */ 825 IS_SKB(skb);
826
827 /* 828 * We may need to remove this from the dev send list. 829 */ 830 cli();
831 if (skb->next)
832 skb_unlink(skb);
833 sti();
834 kfree_skb(skb, FREE_WRITE); /* write. */ 835 if (!sk->dead)
836 sk->write_space(sk);
837 } 838
839 /* 840 * XXX someone ought to look at this too.. at the moment, if skb_peek() 841 * returns non-NULL, we complete ignore the timer stuff in the else 842 * clause. We ought to organize the code so that else clause can 843 * (should) be executed regardless, possibly moving the PROBE timer 844 * reset over. The skb_peek() thing should only move stuff to the 845 * write queue, NOT also manage the timer functions. 846 */ 847
848 /* 849 * Maybe we can take some stuff off of the write queue, 850 * and put it onto the xmit queue. 851 */ 852 if (skb_peek(&sk->write_queue) != NULL)
853 { 854 if (!before(sk->window_seq, sk->write_queue.next->end_seq) &&
855 (sk->retransmits == 0 ||
856 sk->ip_xmit_timeout != TIME_WRITE ||
857 !after(sk->write_queue.next->end_seq, sk->rcv_ack_seq))
858 && sk->packets_out < sk->cong_window)
859 { 860 /* 861 * Add more data to the send queue. 862 */ 863 flag |= 1;
864 tcp_write_xmit(sk);
865 } 866 elseif (before(sk->window_seq, sk->write_queue.next->end_seq) &&
867 sk->send_head == NULL &&
868 sk->ack_backlog == 0 &&
869 sk->state != TCP_TIME_WAIT)
870 { 871 /* 872 * Data to queue but no room. 873 */ 874 tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
875 } 876 } 877 else 878 { 879 /* 880 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets 881 * from TCP_CLOSE we don't do anything 882 * 883 * from anything else, if there is write data (or fin) pending, 884 * we use a TIME_WRITE timeout, else if keepalive we reset to 885 * a KEEPALIVE timeout, else we delete the timer. 886 * 887 * We do not set flag for nominal write data, otherwise we may 888 * force a state where we start to write itsy bitsy tidbits 889 * of data. 890 */ 891
892 switch(sk->state) { 893 caseTCP_TIME_WAIT:
894 /* 895 * keep us in TIME_WAIT until we stop getting packets, 896 * reset the timeout. 897 */ 898 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
899 break;
900 caseTCP_CLOSE:
901 /* 902 * don't touch the timer. 903 */ 904 break;
905 default:
906 /* 907 * Must check send_head and write_queue 908 * to determine which timeout to use. 909 */ 910 if (sk->send_head || !skb_queue_empty(&sk->write_queue)) { 911 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
912 }elseif (sk->keepopen) { 913 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
914 }else{ 915 del_timer(&sk->retransmit_timer);
916 sk->ip_xmit_timeout = 0;
917 } 918 break;
919 } 920 } 921
922 /* 923 * We have nothing queued but space to send. Send any partial 924 * packets immediately (end of Nagle rule application). 925 */ 926
927 if (sk->packets_out == 0
928 && sk->partial != NULL 929 && skb_queue_empty(&sk->write_queue)
930 && sk->send_head == NULL)
931 { 932 flag |= 1;
933 tcp_send_partial(sk);
934 } 935
936 /* 937 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and 938 * we are now waiting for an acknowledge to our FIN. The other end is 939 * already in TIME_WAIT. 940 * 941 * Move to TCP_CLOSE on success. 942 */ 943
944 if (sk->state == TCP_LAST_ACK)
945 { 946 if (!sk->dead)
947 sk->state_change(sk);
948 if(sk->debug)
949 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
950 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
951 if (sk->rcv_ack_seq == sk->write_seq/*&& sk->acked_seq == sk->fin_seq*/)
952 { 953 flag |= 1;
954 sk->shutdown = SHUTDOWN_MASK;
955 tcp_set_state(sk,TCP_CLOSE);
956 return 1;
957 } 958 } 959
960 /* 961 * Incoming ACK to a FIN we sent in the case of our initiating the close. 962 * 963 * Move to FIN_WAIT2 to await a FIN from the other end. Set 964 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in. 965 */ 966
967 if (sk->state == TCP_FIN_WAIT1)
968 { 969
970 if (!sk->dead)
971 sk->state_change(sk);
972 if (sk->rcv_ack_seq == sk->write_seq)
973 { 974 flag |= 1;
975 sk->shutdown |= SEND_SHUTDOWN;
976 tcp_set_state(sk, TCP_FIN_WAIT2);
977 } 978 } 979
980 /* 981 * Incoming ACK to a FIN we sent in the case of a simultaneous close. 982 * 983 * Move to TIME_WAIT 984 */ 985
986 if (sk->state == TCP_CLOSING)
987 { 988
989 if (!sk->dead)
990 sk->state_change(sk);
991 if (sk->rcv_ack_seq == sk->write_seq)
992 { 993 flag |= 1;
994 tcp_time_wait(sk);
995 } 996 } 997
998 /* 999 * Final ack of a three way shake 1000 */1001
1002 if(sk->state==TCP_SYN_RECV)
1003 {1004 tcp_set_state(sk, TCP_ESTABLISHED);
1005 tcp_options(sk,th);
1006 sk->dummy_th.dest=th->source;
1007 sk->copied_seq = sk->acked_seq;
1008 if(!sk->dead)
1009 sk->state_change(sk);
1010 if(sk->max_window==0)
1011 {1012 sk->max_window=32; /* Sanity check */1013 sk->mss=min(sk->max_window,sk->mtu);
1014 }1015 }1016
1017 /*1018 * I make no guarantees about the first clause in the following1019 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under1020 * what conditions "!flag" would be true. However I think the rest1021 * of the conditions would prevent that from causing any1022 * unnecessary retransmission. 1023 * Clearly if the first packet has expired it should be 1024 * retransmitted. The other alternative, "flag&2 && retransmits", is1025 * harder to explain: You have to look carefully at how and when the1026 * timer is set and with what timeout. The most recent transmission always1027 * sets the timer. So in general if the most recent thing has timed1028 * out, everything before it has as well. So we want to go ahead and1029 * retransmit some more. If we didn't explicitly test for this1030 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"1031 * would not be true. If you look at the pattern of timing, you can1032 * show that rto is increased fast enough that the next packet would1033 * almost never be retransmitted immediately. Then you'd end up1034 * waiting for a timeout to send each packet on the retransmission1035 * queue. With my implementation of the Karn sampling algorithm,1036 * the timeout would double each time. The net result is that it would1037 * take a hideous amount of time to recover from a single dropped packet.1038 * It's possible that there should also be a test for TIME_WRITE, but1039 * I think as long as "send_head != NULL" and "retransmit" is on, we've1040 * got to be in real retransmission mode.1041 * Note that tcp_do_retransmit is called with all==1. Setting cong_window1042 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.1043 * As long as no further losses occur, this seems reasonable.1044 */1045
1046 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
1047 (((flag&2) && sk->retransmits) ||
1048 (flag&8) ||
1049 (sk->send_head->when + sk->rto < jiffies)))
1050 {1051 if(sk->send_head->when + sk->rto < jiffies)
1052 tcp_retransmit(sk,0);
1053 else1054 {1055 tcp_do_retransmit(sk, 1);
1056 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1057 }1058 }1059
1060 return 1;
1061
1062 uninteresting_ack:
1063 if(sk->debug)
1064 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
1065
1066 /*1067 * Keepalive processing.1068 */1069
1070 if (after(ack, sk->sent_seq))
1071 {1072 return 0;
1073 }1074
1075 /*1076 * Restart the keepalive timer.1077 */1078
1079 if (sk->keepopen)
1080 {1081 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
1082 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1083 }1084 return 1;
1085 }1086
1087
1088 /*1089 * Process the FIN bit. This now behaves as it is supposed to work1090 * and the FIN takes effect when it is validly part of sequence1091 * space. Not before when we get holes.1092 *1093 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT1094 * (and thence onto LAST-ACK and finally, CLOSE, we never enter1095 * TIME-WAIT)1096 *1097 * If we are in FINWAIT-1, a received FIN indicates simultaneous1098 * close and we go into CLOSING (and later onto TIME-WAIT)1099 *1100 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.1101 *1102 */1103
1104 staticinttcp_fin(structsk_buff *skb, structsock *sk, structtcphdr *th)
/* */1105 {1106 sk->fin_seq = skb->end_seq;
1107
1108 if (!sk->dead)
1109 {1110 sk->state_change(sk);
1111 sock_wake_async(sk->socket, 1);
1112 }1113
1114 switch(sk->state)
1115 {1116 caseTCP_SYN_RECV:
1117 caseTCP_SYN_SENT:
1118 caseTCP_ESTABLISHED:
1119 /*1120 * move to CLOSE_WAIT, tcp_data() already handled1121 * sending the ack.1122 */1123 tcp_set_state(sk,TCP_CLOSE_WAIT);
1124 if (th->rst)
1125 sk->shutdown = SHUTDOWN_MASK;
1126 break;
1127
1128 caseTCP_CLOSE_WAIT:
1129 caseTCP_CLOSING:
1130 /*1131 * received a retransmission of the FIN, do1132 * nothing.1133 */1134 break;
1135 caseTCP_TIME_WAIT:
1136 /*1137 * received a retransmission of the FIN,1138 * restart the TIME_WAIT timer.1139 */1140 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1141 return(0);
1142 caseTCP_FIN_WAIT1:
1143 /*1144 * This case occurs when a simultaneous close1145 * happens, we must ack the received FIN and1146 * enter the CLOSING state.1147 *1148 * This causes a WRITE timeout, which will either1149 * move on to TIME_WAIT when we timeout, or resend1150 * the FIN properly (maybe we get rid of that annoying1151 * FIN lost hang). The TIME_WRITE code is already correct1152 * for handling this timeout.1153 */1154
1155 if(sk->ip_xmit_timeout != TIME_WRITE)
1156 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1157 tcp_set_state(sk,TCP_CLOSING);
1158 break;
1159 caseTCP_FIN_WAIT2:
1160 /*1161 * received a FIN -- send ACK and enter TIME_WAIT1162 */1163 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1164 sk->shutdown|=SHUTDOWN_MASK;
1165 tcp_set_state(sk,TCP_TIME_WAIT);
1166 break;
1167 caseTCP_CLOSE:
1168 /*1169 * already in CLOSE1170 */1171 break;
1172 default:
1173 tcp_set_state(sk,TCP_LAST_ACK);
1174
1175 /* Start the timers. */1176 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1177 return(0);
1178 }1179
1180 return(0);
1181 }1182
1183 /*1184 * Add a sk_buff to the TCP receive queue, calculating1185 * the ACK sequence as we go..1186 */1187 staticinlinevoidtcp_insert_skb(structsk_buff * skb, structsk_buff_head * list)
/* */1188 {1189 structsk_buff * prev, * next;
1190 u32seq;
1191
1192 /*1193 * Find where the new skb goes.. (This goes backwards,1194 * on the assumption that we get the packets in order)1195 */1196 seq = skb->seq;
1197 prev = list->prev;
1198 next = (structsk_buff *) list;
1199 for (;;) {1200 if (prev == (structsk_buff *) list || !after(prev->seq, seq))
1201 break;
1202 next = prev;
1203 prev = prev->prev;
1204 }1205 __skb_insert(skb, prev, next, list);
1206 }1207
1208 /*1209 * Called for each packet when we find a new ACK endpoint sequence in it1210 */1211 staticinlineu32tcp_queue_ack(structsk_buff * skb, structsock * sk)
/* */1212 {1213 /*1214 * When we ack the fin, we do the FIN 1215 * processing.1216 */1217 skb->acked = 1;
1218 if (skb->h.th->fin)
1219 tcp_fin(skb,sk,skb->h.th);
1220 returnskb->end_seq;
1221 }1222
1223 staticvoidtcp_queue(structsk_buff * skb, structsock * sk, structtcphdr *th)
/* */1224 {1225 u32ack_seq;
1226
1227 tcp_insert_skb(skb, &sk->receive_queue);
1228
1229 /*1230 * Did we get anything new to ack?1231 */1232 ack_seq = sk->acked_seq;
1233
1234
1235 if (!after(skb->seq, ack_seq)) {1236 if (after(skb->end_seq, ack_seq)) {1237 /* the packet straddles our window end */1238 structsk_buff_head * list = &sk->receive_queue;
1239 structsk_buff * next;
1240 ack_seq = tcp_queue_ack(skb, sk);
1241
1242 /*1243 * Do we have any old packets to ack that the above1244 * made visible? (Go forward from skb)1245 */1246 next = skb->next;
1247 while (next != (structsk_buff *) list) {1248 if (after(next->seq, ack_seq))
1249 break;
1250 if (after(next->end_seq, ack_seq))
1251 ack_seq = tcp_queue_ack(next, sk);
1252 next = next->next;
1253 }1254
1255 /*1256 * Ok, we found new data, update acked_seq as1257 * necessary (and possibly send the actual1258 * ACK packet).1259 */1260 sk->acked_seq = ack_seq;
1261
1262 }else{1263 if (sk->debug)
1264 printk("Ack duplicate packet.\n");
1265 tcp_send_ack(sk);
1266 return;
1267 }1268
1269
1270 /*1271 * Delay the ack if possible. Send ack's to1272 * fin frames immediately as there shouldn't be1273 * anything more to come.1274 */1275 if (!sk->delay_acks || th->fin) {1276 tcp_send_ack(sk);
1277 }else{1278 /*1279 * If psh is set we assume it's an1280 * interactive session that wants quick1281 * acks to avoid nagling too much. 1282 */1283 intdelay = HZ/2;
1284 if (th->psh)
1285 delay = HZ/50;
1286 tcp_send_delayed_ack(sk, delay);
1287 }1288
1289 /*1290 * Tell the user we have some more data.1291 */1292
1293 if (!sk->dead)
1294 sk->data_ready(sk,0);
1295
1296 }1297 else1298 {1299 /*1300 * If we've missed a packet, send an ack.1301 * Also start a timer to send another.1302 *1303 * 4.3reno machines look for these kind of acks so1304 * they can do fast recovery. Three identical 'old'1305 * acks lets it know that one frame has been lost1306 * and should be resent. Because this is before the1307 * whole window of data has timed out it can take1308 * one lost frame per window without stalling.1309 * [See Jacobson RFC1323, Stevens TCP/IP illus vol2]1310 *1311 * We also should be spotting triple bad sequences.1312 * [We now do this.]1313 *1314 */1315
1316 if (!skb->acked)
1317 {1318 if(sk->debug)
1319 printk("Ack past end of seq packet.\n");
1320 tcp_send_ack(sk);
1321 sk->ack_backlog++;
1322 tcp_reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, HZ/2));
1323 }1324 }1325 }1326
1327
1328 /*1329 * This routine handles the data. If there is room in the buffer,1330 * it will be have already been moved into it. If there is no1331 * room, then we will just have to discard the packet.1332 */1333
1334 staticinttcp_data(structsk_buff *skb, structsock *sk,
/* */1335 unsignedlongsaddr, unsignedintlen)
1336 {1337 structtcphdr *th;
1338 u32new_seq, shut_seq;
1339
1340 th = skb->h.th;
1341 skb_pull(skb,th->doff*4);
1342 skb_trim(skb,len-(th->doff*4));
1343
1344 /*1345 * The bytes in the receive read/assembly queue has increased. Needed for the1346 * low memory discard algorithm 1347 */1348
1349 sk->bytes_rcv += skb->len;
1350
1351 if (skb->len == 0 && !th->fin)
1352 {1353 /* 1354 * Don't want to keep passing ack's back and forth. 1355 * (someone sent us dataless, boring frame)1356 */1357 if (!th->ack)
1358 tcp_send_ack(sk);
1359 kfree_skb(skb, FREE_READ);
1360 return(0);
1361 }1362
1363 /*1364 * We no longer have anyone receiving data on this connection.1365 */1366
1367 #ifndef TCP_DONT_RST_SHUTDOWN
1368
1369 if(sk->shutdown & RCV_SHUTDOWN)
1370 {1371 /*1372 * FIXME: BSD has some magic to avoid sending resets to1373 * broken 4.2 BSD keepalives. Much to my surprise a few non1374 * BSD stacks still have broken keepalives so we want to1375 * cope with it.1376 */1377
1378 if(skb->len) /* We don't care if it's just an ack or1379 a keepalive/window probe */1380 {1381 new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */1382
1383 /* Do this the way 4.4BSD treats it. Not what I'd1384 regard as the meaning of the spec but it's what BSD1385 does and clearly they know everything 8) */1386
1387 /*1388 * This is valid because of two things1389 *1390 * a) The way tcp_data behaves at the bottom.1391 * b) A fin takes effect when read not when received.1392 */1393
1394 shut_seq = sk->acked_seq+1; /* Last byte */1395
1396 if(after(new_seq,shut_seq))
1397 {1398 if(sk->debug)
1399 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
1400 sk, new_seq, shut_seq, sk->blog);
1401 if(sk->dead)
1402 {1403 sk->acked_seq = new_seq + th->fin;
1404 tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
1405 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
1406 tcp_statistics.TcpEstabResets++;
1407 sk->err = EPIPE;
1408 sk->error_report(sk);
1409 sk->shutdown = SHUTDOWN_MASK;
1410 tcp_set_state(sk,TCP_CLOSE);
1411 kfree_skb(skb, FREE_READ);
1412 return 0;
1413 }1414 }1415 }1416 }1417
1418 #endif1419
1420 tcp_queue(skb, sk, th);
1421
1422 return(0);
1423 }1424
1425
1426 /*1427 * This routine is only called when we have urgent data1428 * signalled. Its the 'slow' part of tcp_urg. It could be1429 * moved inline now as tcp_urg is only called from one1430 * place. We handle URGent data wrong. We have to - as1431 * BSD still doesn't use the correction from RFC961.1432 */1433
1434 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */1435 {1436 u32ptr = ntohs(th->urg_ptr);
1437
1438 if (ptr)
1439 ptr--;
1440 ptr += ntohl(th->seq);
1441
1442 /* ignore urgent data that we've already seen and read */1443 if (after(sk->copied_seq, ptr))
1444 return;
1445
1446 /* do we already have a newer (or duplicate) urgent pointer? */1447 if (sk->urg_data && !after(ptr, sk->urg_seq))
1448 return;
1449
1450 /* tell the world about our new urgent pointer */1451 if (sk->proc != 0) {1452 if (sk->proc > 0) {1453 kill_proc(sk->proc, SIGURG, 1);
1454 }else{1455 kill_pg(-sk->proc, SIGURG, 1);
1456 }1457 }1458 sk->urg_data = URG_NOTYET;
1459 sk->urg_seq = ptr;
1460 }1461
1462 /*1463 * This is the 'fast' part of urgent handling.1464 */1465
1466 staticinlinevoidtcp_urg(structsock *sk, structtcphdr *th, unsignedlonglen)
/* */1467 {1468 /*1469 * Check if we get a new urgent pointer - normally not 1470 */1471
1472 if (th->urg)
1473 tcp_check_urg(sk,th);
1474
1475 /*1476 * Do we wait for any urgent data? - normally not1477 */1478
1479 if (sk->urg_data == URG_NOTYET) {1480 u32ptr;
1481
1482 /*1483 * Is the urgent pointer pointing into this packet? 1484 */1485 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
1486 if (ptr < len) {1487 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
1488 if (!sk->dead)
1489 sk->data_ready(sk,0);
1490 }1491 }1492 }1493
1494 /*1495 * This should be a bit smarter and remove partially1496 * overlapping stuff too, but this should be good1497 * enough for any even remotely normal case (and the1498 * worst that can happen is that we have a few1499 * unnecessary packets in the receive queue).1500 *1501 * This function is never called with an empty list..1502 */1503 staticinlinevoidtcp_remove_dups(structsk_buff_head * list)
/* */1504 {1505 structsk_buff * next = list->next;
1506
1507 for (;;) {1508 structsk_buff * skb = next;
1509 next = next->next;
1510 if (next == (structsk_buff *) list)
1511 break;
1512 if (before(next->end_seq, skb->end_seq)) {1513 __skb_unlink(next, list);
1514 kfree_skb(next, FREE_READ);
1515 next = skb;
1516 continue;
1517 }1518 if (next->seq != skb->seq)
1519 continue;
1520 __skb_unlink(skb, list);
1521 kfree_skb(skb, FREE_READ);
1522 }1523 }1524
1525 /*1526 * Throw out all unnecessary packets: we've gone over the1527 * receive queue limit. This shouldn't happen in a normal1528 * TCP connection, but we might have gotten duplicates etc.1529 */1530 staticvoidprune_queue(structsk_buff_head * list)
/* */1531 {1532 for (;;) {1533 structsk_buff * skb = list->prev;
1534
1535 /* gone through it all? */1536 if (skb == (structsk_buff *) list)
1537 break;
1538 if (!skb->acked) {1539 __skb_unlink(skb, list);
1540 kfree_skb(skb, FREE_READ);
1541 continue;
1542 }1543 tcp_remove_dups(list);
1544 break;
1545 }1546 }1547
1548 /*1549 * A TCP packet has arrived.1550 * skb->h.raw is the TCP header.1551 */1552
1553 inttcp_rcv(structsk_buff *skb, structdevice *dev, structoptions *opt,
/* */1554 __u32daddr, unsignedshortlen,
1555 __u32saddr, intredo, structinet_protocol * protocol)
1556 {1557 structtcphdr *th;
1558 structsock *sk;
1559 intsyn_ok=0;
1560
1561 /*1562 * "redo" is 1 if we have already seen this skb but couldn't1563 * use it at that time (the socket was locked). In that case1564 * we have already done a lot of the work (looked up the socket1565 * etc).1566 */1567 th = skb->h.th;
1568 sk = skb->sk;
1569 if (!redo) {1570 tcp_statistics.TcpInSegs++;
1571 if (skb->pkt_type!=PACKET_HOST)
1572 gotodiscard_it;
1573
1574 /*1575 * Pull up the IP header.1576 */1577
1578 skb_pull(skb, skb->h.raw-skb->data);
1579
1580 /*1581 * Try to use the device checksum if provided.1582 */1583 switch (skb->ip_summed)
1584 {1585 caseCHECKSUM_NONE:
1586 skb->csum = csum_partial((char *)th, len, 0);
1587 caseCHECKSUM_HW:
1588 if (tcp_check(th, len, saddr, daddr, skb->csum))
1589 gotodiscard_it;
1590 default:
1591 /* CHECKSUM_UNNECESSARY */1592 }1593 sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
1594 if (!sk)
1595 gotono_tcp_socket;
1596 skb->sk = sk;
1597 skb->seq = ntohl(th->seq);
1598 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
1599 skb->ack_seq = ntohl(th->ack_seq);
1600
1601 skb->acked = 0;
1602 skb->used = 0;
1603 skb->free = 1;
1604 skb->saddr = daddr;
1605 skb->daddr = saddr;
1606
1607 /*1608 * We may need to add it to the backlog here. 1609 */1610 if (sk->users)
1611 {1612 __skb_queue_tail(&sk->back_log, skb);
1613 return(0);
1614 }1615 }1616
1617 /*1618 * If this socket has got a reset it's to all intents and purposes 1619 * really dead. Count closed sockets as dead.1620 *1621 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD1622 * simply drops data. This seems incorrect as a 'closed' TCP doesn't1623 * exist so should cause resets as if the port was unreachable.1624 */1625
1626 if (sk->zapped || sk->state==TCP_CLOSE)
1627 gotono_tcp_socket;
1628
1629 if (!sk->prot)
1630 {1631 printk("IMPOSSIBLE 3\n");
1632 return(0);
1633 }1634
1635
1636 /*1637 * Charge the memory to the socket. 1638 */1639
1640 skb->sk=sk;
1641 atomic_add(skb->truesize, &sk->rmem_alloc);
1642
1643 /*1644 * We should now do header prediction.1645 */1646
1647 /*1648 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We1649 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug1650 * compatibility. We also set up variables more thoroughly [Karn notes in the1651 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].1652 */1653
1654 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */1655 {1656
1657 /*1658 * Now deal with unusual cases.1659 */1660
1661 if(sk->state==TCP_LISTEN)
1662 {1663 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */1664 tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
1665
1666 /*1667 * We don't care for RST, and non SYN are absorbed (old segments)1668 * Broadcast/multicast SYN isn't allowed. Note - bug if you change the1669 * netmask on a running connection it can go broadcast. Even Sun's have1670 * this problem so I'm ignoring it 1671 */1672
1673 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
1674 {1675 kfree_skb(skb, FREE_READ);
1676 return 0;
1677 }1678
1679 /* 1680 * Guess we need to make a new socket up 1681 */1682
1683 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
1684
1685 /*1686 * Now we have several options: In theory there is nothing else1687 * in the frame. KA9Q has an option to send data with the syn,1688 * BSD accepts data with the syn up to the [to be] advertised window1689 * and Solaris 2.1 gives you a protocol error. For now we just ignore1690 * it, that fits the spec precisely and avoids incompatibilities. It1691 * would be nice in future to drop through and process the data.1692 *1693 * Now TTCP is starting to use we ought to queue this data.1694 */1695
1696 return 0;
1697 }1698
1699 /* 1700 * Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN1701 * then its a new connection1702 */1703
1704 if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
1705 {1706 kfree_skb(skb, FREE_READ);
1707 return 0;
1708 }1709
1710 /*1711 * SYN sent means we have to look for a suitable ack and either reset1712 * for bad matches or go to connected. The SYN_SENT case is unusual and should1713 * not be in line code. [AC]1714 */1715
1716 if(sk->state==TCP_SYN_SENT)
1717 {1718 /* Crossed SYN or previous junk segment */1719 if(th->ack)
1720 {1721 /* We got an ack, but it's not a good ack */1722 if(!tcp_ack(sk,th,skb->ack_seq,len))
1723 {1724 /* Reset the ack - its an ack from a 1725 different connection [ th->rst is checked in tcp_send_reset()] */1726 tcp_statistics.TcpAttemptFails++;
1727 tcp_send_reset(daddr, saddr, th,
1728 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1729 kfree_skb(skb, FREE_READ);
1730 return(0);
1731 }1732 if(th->rst)
1733 returntcp_reset(sk,skb);
1734 if(!th->syn)
1735 {1736 /* A valid ack from a different connection1737 start. Shouldn't happen but cover it */1738 tcp_statistics.TcpAttemptFails++;
1739 tcp_send_reset(daddr, saddr, th,
1740 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1741 kfree_skb(skb, FREE_READ);
1742 return 0;
1743 }1744 /*1745 * Ok.. it's good. Set up sequence numbers and1746 * move to established.1747 */1748 syn_ok=1; /* Don't reset this connection for the syn */1749 sk->acked_seq = skb->seq+1;
1750 sk->lastwin_seq = skb->seq+1;
1751 sk->fin_seq = skb->seq;
1752 tcp_send_ack(sk);
1753 tcp_set_state(sk, TCP_ESTABLISHED);
1754 tcp_options(sk,th);
1755 sk->dummy_th.dest=th->source;
1756 sk->copied_seq = sk->acked_seq;
1757 if(!sk->dead)
1758 {1759 sk->state_change(sk);
1760 sock_wake_async(sk->socket, 0);
1761 }1762 if(sk->max_window==0)
1763 {1764 sk->max_window = 32;
1765 sk->mss = min(sk->max_window, sk->mtu);
1766 }1767 }1768 else1769 {1770 /* See if SYN's cross. Drop if boring */1771 if(th->syn && !th->rst)
1772 {1773 /* Crossed SYN's are fine - but talking to1774 yourself is right out... */1775 if(sk->saddr==saddr && sk->daddr==daddr &&
1776 sk->dummy_th.source==th->source &&
1777 sk->dummy_th.dest==th->dest)
1778 {1779 tcp_statistics.TcpAttemptFails++;
1780 returntcp_reset(sk,skb);
1781 }1782 tcp_set_state(sk,TCP_SYN_RECV);
1783
1784 /*1785 * FIXME:1786 * Must send SYN|ACK here1787 */1788 }1789 /* Discard junk segment */1790 kfree_skb(skb, FREE_READ);
1791 return 0;
1792 }1793 /*1794 * SYN_RECV with data maybe.. drop through1795 */1796 gotorfc_step6;
1797 }1798
1799 /*1800 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is1801 * a more complex suggestion for fixing these reuse issues in RFC16441802 * but not yet ready for general use. Also see RFC1379.1803 *1804 * Note the funny way we go back to the top of this function for1805 * this case ("goto try_next_socket"). That also takes care of1806 * checking "sk->users" for the new socket as well as doing all1807 * the normal tests on the packet.1808 */1809
1810 #defineBSD_TIME_WAIT1811 #ifdefBSD_TIME_WAIT1812 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
1813 after(skb->seq, sk->acked_seq) && !th->rst)
1814 {1815 u32seq = sk->write_seq;
1816 if(sk->debug)
1817 printk("Doing a BSD time wait\n");
1818 tcp_statistics.TcpEstabResets++;
1819 atomic_sub(skb->truesize, &sk->rmem_alloc);
1820 skb->sk = NULL;
1821 sk->err=ECONNRESET;
1822 tcp_set_state(sk, TCP_CLOSE);
1823 sk->shutdown = SHUTDOWN_MASK;
1824 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
1825 /* this is not really correct: we should check sk->users */1826 if (sk && sk->state==TCP_LISTEN)
1827 {1828 skb->sk = sk;
1829 atomic_add(skb->truesize, &sk->rmem_alloc);
1830 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
1831 return 0;
1832 }1833 kfree_skb(skb, FREE_READ);
1834 return 0;
1835 }1836 #endif1837 }1838
1839 /*1840 * We are now in normal data flow (see the step list in the RFC)1841 * Note most of these are inline now. I'll inline the lot when1842 * I have time to test it hard and look at what gcc outputs 1843 */1844
1845 if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
1846 {1847 bad_tcp_sequence(sk, th, skb->end_seq-th->syn, dev);
1848 kfree_skb(skb, FREE_READ);
1849 return 0;
1850 }1851
1852 if(th->rst)
1853 returntcp_reset(sk,skb);
1854
1855 /*1856 * !syn_ok is effectively the state test in RFC793.1857 */1858
1859 if(th->syn && !syn_ok)
1860 {1861 tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
1862 returntcp_reset(sk,skb);
1863 }1864
1865 tcp_delack_estimator(sk);
1866
1867 /*1868 * Process the ACK1869 */1870
1871
1872 if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
1873 {1874 /*1875 * Our three way handshake failed.1876 */1877
1878 if(sk->state==TCP_SYN_RECV)
1879 {1880 tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
1881 }1882 kfree_skb(skb, FREE_READ);
1883 return 0;
1884 }1885
1886 rfc_step6: /* I'll clean this up later */1887
1888 /*1889 * If the accepted buffer put us over our queue size we1890 * now drop it (we must process the ack first to avoid1891 * deadlock cases).1892 */1893
1894 /*1895 * Process urgent data1896 */1897
1898 tcp_urg(sk, th, len);
1899
1900 /*1901 * Process the encapsulated data1902 */1903
1904 if(tcp_data(skb,sk, saddr, len))
1905 kfree_skb(skb, FREE_READ);
1906
1907 /*1908 * If our receive queue has grown past its limits,1909 * try to prune away duplicates etc..1910 */1911 if (sk->rmem_alloc > sk->rcvbuf)
1912 prune_queue(&sk->receive_queue);
1913
1914 /*1915 * And done1916 */1917
1918 return 0;
1919
1920 no_tcp_socket:
1921 /*1922 * No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)1923 */1924 tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
1925
1926 discard_it:
1927 /*1928 * Discard frame1929 */1930 skb->sk = NULL;
1931 kfree_skb(skb, FREE_READ);
1932 return 0;
1933 }