/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	@(#)tcp_input.c	1.0.16	05/25/93
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * FIXES
 *		Pedro Roque	:	Double ACK bug
 */

26 #include <linux/config.h>
27 #include <net/tcp.h>
28
/*
 *	Policy code extracted so it's now separate
 */

33 /* 34 * Called each time to estimate the delayed ack timeout. This is 35 * how it should be done so a fast link isn't impacted by ack delay. 36 */ 37
38 extern__inline__voidtcp_delack_estimator(structsock *sk)
/* */ 39 { 40 /* 41 * Delayed ACK time estimator. 42 */ 43
44 if (sk->lrcvtime == 0)
45 { 46 sk->lrcvtime = jiffies;
47 sk->ato = HZ/3;
48 } 49 else 50 { 51 intm;
52
53 m = jiffies - sk->lrcvtime;
54
55 sk->lrcvtime = jiffies;
56
57 if (m <= 0)
58 m = 1;
59
60 if (m > (sk->rtt >> 3))
61 { 62 sk->ato = sk->rtt >> 3;
63 /* 64 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato); 65 */ 66 } 67 else 68 { 69 sk->ato = (sk->ato >> 1) + m;
70 /* 71 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato); 72 */ 73 } 74 } 75 } 76
77 /* 78 * Called on frames that were known _not_ to have been 79 * retransmitted [see Karn/Partridge Proceedings SIGCOMM 87]. 80 * The algorithm is from the SIGCOMM 88 piece by Van Jacobson. 81 */ 82
83 extern__inline__voidtcp_rtt_estimator(structsock *sk, structsk_buff *oskb)
/* */ 84 { 85 longm;
86 /* 87 * The following amusing code comes from Jacobson's 88 * article in SIGCOMM '88. Note that rtt and mdev 89 * are scaled versions of rtt and mean deviation. 90 * This is designed to be as fast as possible 91 * m stands for "measurement". 92 */ 93
94 m = jiffies - oskb->when; /* RTT */ 95 if(m<=0)
96 m=1; /* IS THIS RIGHT FOR <0 ??? */ 97 m -= (sk->rtt >> 3); /* m is now error in rtt est */ 98 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */ 99 if (m < 0)
100 m = -m; /* m is now abs(error) */ 101 m -= (sk->mdev >> 2); /* similar update on mdev */ 102 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ 103
104 /* 105 * Now update timeout. Note that this removes any backoff. 106 */ 107
108 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
109 if (sk->rto > 120*HZ)
110 sk->rto = 120*HZ;
111 if (sk->rto < HZ/5) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */ 112 sk->rto = HZ/5;
113 sk->backoff = 0;
114 } 115
116 /* 117 * Cached last hit socket 118 */ 119
120 staticvolatileunsignedlongth_cache_saddr, th_cache_daddr;
121 staticvolatileunsignedshortth_cache_dport, th_cache_sport;
122 staticvolatilestructsock *th_cache_sk;
123
124 voidtcp_cache_zap(void)
/* */ 125 { 126 th_cache_sk=NULL;
127 } 128
129 /* 130 * Find the socket, using the last hit cache if applicable. The cache is not quite 131 * right... 132 */ 133
134 staticinlinestructsock * get_tcp_sock(u32saddr, u16sport, u32daddr, u16dport)
/* */ 135 { 136 structsock * sk;
137
138 sk = (structsock *) th_cache_sk;
139 if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
140 sport != th_cache_sport || dport != th_cache_dport) { 141 sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
142 if (sk) { 143 th_cache_saddr=saddr;
144 th_cache_daddr=daddr;
145 th_cache_dport=dport;
146 th_cache_sport=sport;
147 th_cache_sk=sk;
148 } 149 } 150 returnsk;
151 } 152
/*
 *	React to an out-of-window TCP sequence number in an incoming packet
 */

157 staticvoidbad_tcp_sequence(structsock *sk, structtcphdr *th, u32end_seq,
/* */ 158 structdevice *dev)
159 { 160 if (th->rst)
161 return;
162
163 /* 164 * Send a reset if we get something not ours and we are 165 * unsynchronized. Note: We don't do anything to our end. We 166 * are just killing the bogus remote connection then we will 167 * connect again and it will work (with luck). 168 */ 169
170 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
171 { 172 tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
173 return;
174 } 175
176 /* 177 * 4.3reno machines look for these kind of acks so they can do fast 178 * recovery. Three identical 'old' acks lets it know that one frame has 179 * been lost and should be resent. Because this is before the whole window 180 * of data has timed out it can take one lost frame per window without 181 * stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2] 182 */ 183 tcp_send_ack(sk);
184 } 185
/*
 *	This function checks to see if the tcp header is actually acceptable.
 */

190 extern__inline__inttcp_sequence(structsock *sk, u32seq, u32end_seq)
/* */ 191 { 192 u32end_window = sk->acked_seq + sk->window;
193 return/* if start is at end of window, end must be too (zero window) */ 194 (seq == end_window && seq == end_seq) ||
195 /* if start is before end of window, check for interest */ 196 (before(seq, end_window) && !before(end_seq, sk->acked_seq));
197 } 198
199 /* 200 * When we get a reset we do this. This probably is a tcp_output routine 201 * really. 202 */ 203
204 staticinttcp_reset(structsock *sk, structsk_buff *skb)
/* */ 205 { 206 sk->zapped = 1;
207 /* 208 * We want the right error as BSD sees it (and indeed as we do). 209 */ 210 sk->err = ECONNRESET;
211 if (sk->state == TCP_SYN_SENT)
212 sk->err = ECONNREFUSED;
213 if (sk->state == TCP_CLOSE_WAIT)
214 sk->err = EPIPE;
215 #ifdef CONFIG_TCP_RFC1337
216 /* 217 * Time wait assassination protection [RFC1337] 218 * 219 * This is a good idea, but causes more sockets to take time to close. 220 * 221 * Ian Heavens has since shown this is an inadequate fix for the protocol 222 * bug in question. 223 */ 224 if(sk->state!=TCP_TIME_WAIT)
225 { 226 tcp_set_state(sk,TCP_CLOSE);
227 sk->shutdown = SHUTDOWN_MASK;
228 } 229 #else 230 tcp_set_state(sk,TCP_CLOSE);
231 sk->shutdown = SHUTDOWN_MASK;
232 #endif 233 if (!sk->dead)
234 sk->state_change(sk);
235 kfree_skb(skb, FREE_READ);
236 return(0);
237 } 238
239
240 /* 241 * Look for tcp options. Parses everything but only knows about MSS. 242 * This routine is always called with the packet containing the SYN. 243 * However it may also be called with the ack to the SYN. So you 244 * can't assume this is always the SYN. It's always called after 245 * we have set up sk->mtu to our own MTU. 246 * 247 * We need at minimum to add PAWS support here. Possibly large windows 248 * as Linux gets deployed on 100Mb/sec networks. 249 */ 250
251 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */ 252 { 253 unsignedchar *ptr;
254 intlength=(th->doff*4)-sizeof(structtcphdr);
255 intmss_seen = 0;
256
257 ptr = (unsignedchar *)(th + 1);
258
259 while(length>0)
260 { 261 intopcode=*ptr++;
262 intopsize=*ptr++;
263 switch(opcode)
264 { 265 caseTCPOPT_EOL:
266 return;
267 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ 268 length--;
269 ptr--; /* the opsize=*ptr++ above was a mistake */ 270 continue;
271
272 default:
273 if(opsize<=2) /* Avoid silly options looping forever */ 274 return;
275 switch(opcode)
276 { 277 caseTCPOPT_MSS:
278 if(opsize==4 && th->syn)
279 { 280 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
281 mss_seen = 1;
282 } 283 break;
284 /* Add other options here as people feel the urge to implement stuff like large windows */ 285 } 286 ptr+=opsize-2;
287 length-=opsize;
288 } 289 } 290 if (th->syn)
291 { 292 if (! mss_seen)
293 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */ 294 } 295 #ifdefCONFIG_INET_PCTCP 296 sk->mss = min(sk->max_window >> 1, sk->mtu);
297 #else 298 sk->mss = min(sk->max_window, sk->mtu);
299 sk->max_unacked = 2 * sk->mss;
300 #endif 301 } 302
303
304 /* 305 * This routine handles a connection request. 306 * It should make sure we haven't already responded. 307 * Because of the way BSD works, we have to send a syn/ack now. 308 * This also means it will be harder to close a socket which is 309 * listening. 310 */ 311
312 staticvoidtcp_conn_request(structsock *sk, structsk_buff *skb,
/* */ 313 u32daddr, u32saddr, structoptions *opt, structdevice *dev, u32seq)
314 { 315 structsock *newsk;
316 structtcphdr *th;
317 structrtable *rt;
318
319 th = skb->h.th;
320
321 /* If the socket is dead, don't accept the connection. */ 322 if (!sk->dead)
323 { 324 sk->data_ready(sk,0);
325 } 326 else 327 { 328 if(sk->debug)
329 printk("Reset on %p: Connect on dead socket.\n",sk);
330 tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
331 tcp_statistics.TcpAttemptFails++;
332 kfree_skb(skb, FREE_READ);
333 return;
334 } 335
336 /* 337 * Make sure we can accept more. This will prevent a 338 * flurry of syns from eating up all our memory. 339 * 340 * BSD does some funnies here and allows 3/2 times the 341 * set backlog as a fudge factor. Thats just too gross. 342 */ 343
344 if (sk->ack_backlog >= sk->max_ack_backlog)
345 { 346 tcp_statistics.TcpAttemptFails++;
347 kfree_skb(skb, FREE_READ);
348 return;
349 } 350
351 /* 352 * We need to build a new sock struct. 353 * It is sort of bad to have a socket without an inode attached 354 * to it, but the wake_up's will just wake up the listening socket, 355 * and if the listening socket is destroyed before this is taken 356 * off of the queue, this will take care of it. 357 */ 358
359 newsk = (structsock *) kmalloc(sizeof(structsock), GFP_ATOMIC);
360 if (newsk == NULL)
361 { 362 /* just ignore the syn. It will get retransmitted. */ 363 tcp_statistics.TcpAttemptFails++;
364 kfree_skb(skb, FREE_READ);
365 return;
366 } 367
368 memcpy(newsk, sk, sizeof(*newsk));
369 newsk->opt = NULL;
370 newsk->ip_route_cache = NULL;
371 if (opt && opt->optlen)
372 { 373 sk->opt = (structoptions*)kmalloc(sizeof(structoptions)+opt->optlen, GFP_ATOMIC);
374 if (!sk->opt)
375 { 376 kfree_s(newsk, sizeof(structsock));
377 tcp_statistics.TcpAttemptFails++;
378 kfree_skb(skb, FREE_READ);
379 return;
380 } 381 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb))
382 { 383 kfree_s(sk->opt, sizeof(structoptions)+opt->optlen);
384 kfree_s(newsk, sizeof(structsock));
385 tcp_statistics.TcpAttemptFails++;
386 kfree_skb(skb, FREE_READ);
387 return;
388 } 389 } 390 skb_queue_head_init(&newsk->write_queue);
391 skb_queue_head_init(&newsk->receive_queue);
392 newsk->send_head = NULL;
393 newsk->send_tail = NULL;
394 skb_queue_head_init(&newsk->back_log);
395 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/ 396 newsk->rto = TCP_TIMEOUT_INIT;
397 newsk->mdev = 0;
398 newsk->max_window = 0;
399 newsk->cong_window = 1;
400 newsk->cong_count = 0;
401 newsk->ssthresh = 0;
402 newsk->backoff = 0;
403 newsk->blog = 0;
404 newsk->intr = 0;
405 newsk->proc = 0;
406 newsk->done = 0;
407 newsk->partial = NULL;
408 newsk->pair = NULL;
409 newsk->wmem_alloc = 0;
410 newsk->rmem_alloc = 0;
411 newsk->localroute = sk->localroute;
412
413 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
414
415 newsk->err = 0;
416 newsk->shutdown = 0;
417 newsk->ack_backlog = 0;
418 newsk->acked_seq = skb->seq+1;
419 newsk->lastwin_seq = skb->seq+1;
420 newsk->delay_acks = 1;
421 newsk->copied_seq = skb->seq+1;
422 newsk->fin_seq = skb->seq;
423 newsk->state = TCP_SYN_RECV;
424 newsk->timeout = 0;
425 newsk->ip_xmit_timeout = 0;
426 newsk->write_seq = seq;
427 newsk->window_seq = newsk->write_seq;
428 newsk->rcv_ack_seq = newsk->write_seq;
429 newsk->urg_data = 0;
430 newsk->retransmits = 0;
431 newsk->linger=0;
432 newsk->destroy = 0;
433 init_timer(&newsk->timer);
434 newsk->timer.data = (unsignedlong)newsk;
435 newsk->timer.function = &net_timer;
436 init_timer(&newsk->delack_timer);
437 newsk->delack_timer.data = (unsignedlong)newsk;
438 newsk->delack_timer.function = tcp_delack_timer;
439 init_timer(&newsk->retransmit_timer);
440 newsk->retransmit_timer.data = (unsignedlong)newsk;
441 newsk->retransmit_timer.function = tcp_retransmit_timer;
442 newsk->dummy_th.source = skb->h.th->dest;
443 newsk->dummy_th.dest = skb->h.th->source;
444
445 /* 446 * Swap these two, they are from our point of view. 447 */ 448
449 newsk->daddr = saddr;
450 newsk->saddr = daddr;
451 newsk->rcv_saddr = daddr;
452
453 put_sock(newsk->num,newsk);
454 newsk->acked_seq = skb->seq + 1;
455 newsk->copied_seq = skb->seq + 1;
456 newsk->socket = NULL;
457
458 /* 459 * Grab the ttl and tos values and use them 460 */ 461
462 newsk->ip_ttl=sk->ip_ttl;
463 newsk->ip_tos=skb->ip_hdr->tos;
464
465 /* 466 * Use 512 or whatever user asked for 467 */ 468
469 /* 470 * Note use of sk->user_mss, since user has no direct access to newsk 471 */ 472
473 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
474 newsk->ip_route_cache = rt;
475
476 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
477 newsk->window_clamp = rt->rt_window;
478 else 479 newsk->window_clamp = 0;
480
481 if (sk->user_mss)
482 newsk->mtu = sk->user_mss;
483 elseif (rt)
484 newsk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
485 else 486 newsk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
487
488 /* 489 * But not bigger than device MTU 490 */ 491
492 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
493
494 #ifdefCONFIG_SKIP 495
496 /* 497 * SKIP devices set their MTU to 65535. This is so they can take packets 498 * unfragmented to security process then fragment. They could lie to the 499 * TCP layer about a suitable MTU, but its easier to let skip sort it out 500 * simply because the final package we want unfragmented is going to be 501 * 502 * [IPHDR][IPSP][Security data][Modified TCP data][Security data] 503 */ 504
505 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */ 506 sk->mtu=skip_pick_mtu(sk->mtu,dev);
507 #endif 508 /* 509 * This will min with what arrived in the packet 510 */ 511
512 tcp_options(newsk,skb->h.th);
513
514 tcp_cache_zap();
515 tcp_send_synack(newsk, sk, skb);
516 } 517
518
519 /* 520 * Handle a TCP window that shrunk on us. It shouldn't happen, 521 * but.. 522 * 523 * We may need to move packets from the send queue 524 * to the write queue, if the window has been shrunk on us. 525 * The RFC says you are not allowed to shrink your window 526 * like this, but if the other end does, you must be able 527 * to deal with it. 528 */ 529 voidtcp_window_shrunk(structsock * sk, u32window_seq)
/* */ 530 { 531 structsk_buff *skb;
532 structsk_buff *skb2;
533 structsk_buff *wskb = NULL;
534
535 skb2 = sk->send_head;
536 sk->send_head = NULL;
537 sk->send_tail = NULL;
538
539 /* 540 * This is an artifact of a flawed concept. We want one 541 * queue and a smarter send routine when we send all. 542 */ 543 cli();
544 while (skb2 != NULL)
545 { 546 skb = skb2;
547 skb2 = skb->link3;
548 skb->link3 = NULL;
549 if (after(skb->end_seq, window_seq))
550 { 551 if (sk->packets_out > 0)
552 sk->packets_out--;
553 /* We may need to remove this from the dev send list. */ 554 if (skb->next != NULL)
555 { 556 skb_unlink(skb);
557 } 558 /* Now add it to the write_queue. */ 559 if (wskb == NULL)
560 skb_queue_head(&sk->write_queue,skb);
561 else 562 skb_append(wskb,skb);
563 wskb = skb;
564 } 565 else 566 { 567 if (sk->send_head == NULL)
568 { 569 sk->send_head = skb;
570 sk->send_tail = skb;
571 } 572 else 573 { 574 sk->send_tail->link3 = skb;
575 sk->send_tail = skb;
576 } 577 skb->link3 = NULL;
578 } 579 } 580 sti();
581 } 582
583
584 /* 585 * This routine deals with incoming acks, but not outgoing ones. 586 * 587 * This routine is totally _WRONG_. The list structuring is wrong, 588 * the algorithm is wrong, the code is wrong. 589 */ 590
591 staticinttcp_ack(structsock *sk, structtcphdr *th, u32ack, intlen)
/* */ 592 { 593 intflag = 0;
594 u32window_seq;
595
596 /* 597 * 1 - there was data in packet as well as ack or new data is sent or 598 * in shutdown state 599 * 2 - data from retransmit queue was acked and removed 600 * 4 - window shrunk or data from retransmit queue was acked and removed 601 * 8 - we want to do a fast retransmit. One packet only. 602 */ 603
604 if(sk->zapped)
605 return(1); /* Dead, cant ack any more so why bother */ 606
607 /* 608 * We have dropped back to keepalive timeouts. Thus we have 609 * no retransmits pending. 610 */ 611
612 if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
613 sk->retransmits = 0;
614
615 /* 616 * If the ack is newer than sent or older than previous acks 617 * then we can probably ignore it. 618 */ 619
620 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
621 gotouninteresting_ack;
622
623 /* 624 * If there is data set flag 1 625 */ 626
627 if (len != th->doff*4)
628 flag |= 1;
629
630 /* 631 * Have we discovered a larger window 632 */ 633 window_seq = ntohs(th->window);
634 if (window_seq > sk->max_window)
635 { 636 sk->max_window = window_seq;
637 #ifdefCONFIG_INET_PCTCP 638 /* Hack because we don't send partial packets to non SWS 639 handling hosts */ 640 sk->mss = min(window_seq>>1, sk->mtu);
641 #else 642 sk->mss = min(window_seq, sk->mtu);
643 #endif 644 } 645 window_seq += ack;
646
647 /* 648 * See if our window has been shrunk. 649 */ 650 if (after(sk->window_seq, window_seq)) { 651 flag |= 4;
652 tcp_window_shrunk(sk, window_seq);
653 } 654
655 /* 656 * Pipe has emptied 657 */ 658 if (sk->send_tail == NULL || sk->send_head == NULL)
659 { 660 sk->send_head = NULL;
661 sk->send_tail = NULL;
662 sk->packets_out= 0;
663 } 664
665 /* 666 * We don't want too many packets out there. 667 */ 668
669 if (sk->ip_xmit_timeout == TIME_WRITE &&
670 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
671 { 672
673 /* 674 * This is Jacobson's slow start and congestion avoidance. 675 * SIGCOMM '88, p. 328. Because we keep cong_window in integral 676 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a 677 * counter and increment it once every cwnd times. It's possible 678 * that this should be done only if sk->retransmits == 0. I'm 679 * interpreting "new data is acked" as including data that has 680 * been retransmitted but is just now being acked. 681 */ 682 if (sk->cong_window < sk->ssthresh)
683 /* 684 * In "safe" area, increase 685 */ 686 sk->cong_window++;
687 else 688 { 689 /* 690 * In dangerous area, increase slowly. In theory this is 691 * sk->cong_window += 1 / sk->cong_window 692 */ 693 if (sk->cong_count >= sk->cong_window)
694 { 695 sk->cong_window++;
696 sk->cong_count = 0;
697 } 698 else 699 sk->cong_count++;
700 } 701 } 702
703 /* 704 * Remember the highest ack received and update the 705 * right hand window edge of the host. 706 * We do a bit of work here to track number of times we've 707 * seen this ack without a change in the right edge of the 708 * window and no data in the packet. 709 * This will allow us to do fast retransmits. 710 */ 711
712 if (sk->rcv_ack_seq == ack && sk->window_seq == window_seq && !(flag&1))
713 { 714 /* 715 * We only want to short cut this once, many 716 * ACKs may still come, we'll do a normal transmit 717 * for these ACKs. 718 */ 719 if (++sk->rcv_ack_cnt == MAX_DUP_ACKS+1)
720 flag |= 8; /* flag for a fast retransmit */ 721 } 722 else 723 { 724 sk->window_seq = window_seq;
725 sk->rcv_ack_seq = ack;
726 sk->rcv_ack_cnt = 1;
727 } 728
729 /* 730 * We passed data and got it acked, remove any soft error 731 * log. Something worked... 732 */ 733
734 sk->err_soft = 0;
735
736 /* 737 * If this ack opens up a zero window, clear backoff. It was 738 * being used to time the probes, and is probably far higher than 739 * it needs to be for normal retransmission. 740 */ 741
742 if (sk->ip_xmit_timeout == TIME_PROBE0)
743 { 744 sk->retransmits = 0; /* Our probe was answered */ 745
746 /* 747 * Was it a usable window open ? 748 */ 749
750 if (!skb_queue_empty(&sk->write_queue) && /* should always be true */ 751 ! before (sk->window_seq, sk->write_queue.next->end_seq))
752 { 753 sk->backoff = 0;
754
755 /* 756 * Recompute rto from rtt. this eliminates any backoff. 757 */ 758
759 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
760 if (sk->rto > 120*HZ)
761 sk->rto = 120*HZ;
762 if (sk->rto < HZ/5) /* Was 1*HZ, then 1 - turns out we must allow about 763 .2 of a second because of BSD delayed acks - on a 100Mb/sec link 764 .2 of a second is going to need huge windows (SIGH) */ 765 sk->rto = HZ/5;
766 } 767 } 768
769 /* 770 * See if we can take anything off of the retransmit queue. 771 */ 772
773 for (;;) { 774 structsk_buff * skb = sk->send_head;
775 if (!skb)
776 break;
777
778 /* Check for a bug. */ 779 if (skb->link3 && after(skb->end_seq, skb->link3->end_seq))
780 printk("INET: tcp.c: *** bug send_list out of order.\n");
781
782 /* 783 * If our packet is before the ack sequence we can 784 * discard it as it's confirmed to have arrived the other end. 785 */ 786
787 if (after(skb->end_seq, ack))
788 break;
789
790 if (sk->retransmits)
791 { 792 /* 793 * We were retransmitting. don't count this in RTT est 794 */ 795 flag |= 2;
796 } 797
798 if ((sk->send_head = skb->link3) == NULL)
799 { 800 sk->send_tail = NULL;
801 sk->retransmits = 0;
802 } 803 /* 804 * Note that we only reset backoff and rto in the 805 * rtt recomputation code. And that doesn't happen 806 * if there were retransmissions in effect. So the 807 * first new packet after the retransmissions is 808 * sent with the backoff still in effect. Not until 809 * we get an ack from a non-retransmitted packet do 810 * we reset the backoff and rto. This allows us to deal 811 * with a situation where the network delay has increased 812 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.) 813 */ 814
815 /* 816 * We have one less packet out there. 817 */ 818
819 if (sk->packets_out > 0)
820 sk->packets_out --;
821
822 if (!(flag&2)) /* Not retransmitting */ 823 tcp_rtt_estimator(sk,skb);
824 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt 825 In this case as we just set it up */ 826 IS_SKB(skb);
827
828 /* 829 * We may need to remove this from the dev send list. 830 */ 831 cli();
832 if (skb->next)
833 skb_unlink(skb);
834 sti();
835 kfree_skb(skb, FREE_WRITE); /* write. */ 836 if (!sk->dead)
837 sk->write_space(sk);
838 } 839
840 /* 841 * XXX someone ought to look at this too.. at the moment, if skb_peek() 842 * returns non-NULL, we complete ignore the timer stuff in the else 843 * clause. We ought to organize the code so that else clause can 844 * (should) be executed regardless, possibly moving the PROBE timer 845 * reset over. The skb_peek() thing should only move stuff to the 846 * write queue, NOT also manage the timer functions. 847 */ 848
849 /* 850 * Maybe we can take some stuff off of the write queue, 851 * and put it onto the xmit queue. 852 */ 853 if (skb_peek(&sk->write_queue) != NULL)
854 { 855 if (!before(sk->window_seq, sk->write_queue.next->end_seq) &&
856 (sk->retransmits == 0 ||
857 sk->ip_xmit_timeout != TIME_WRITE ||
858 !after(sk->write_queue.next->end_seq, sk->rcv_ack_seq))
859 && sk->packets_out < sk->cong_window)
860 { 861 /* 862 * Add more data to the send queue. 863 */ 864 flag |= 1;
865 tcp_write_xmit(sk);
866 } 867 elseif (before(sk->window_seq, sk->write_queue.next->end_seq) &&
868 sk->send_head == NULL &&
869 sk->ack_backlog == 0 &&
870 sk->state != TCP_TIME_WAIT)
871 { 872 /* 873 * Data to queue but no room. 874 */ 875 tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
876 } 877 } 878 else 879 { 880 /* 881 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets 882 * from TCP_CLOSE we don't do anything 883 * 884 * from anything else, if there is write data (or fin) pending, 885 * we use a TIME_WRITE timeout, else if keepalive we reset to 886 * a KEEPALIVE timeout, else we delete the timer. 887 * 888 * We do not set flag for nominal write data, otherwise we may 889 * force a state where we start to write itsy bitsy tidbits 890 * of data. 891 */ 892
893 switch(sk->state) { 894 caseTCP_TIME_WAIT:
895 /* 896 * keep us in TIME_WAIT until we stop getting packets, 897 * reset the timeout. 898 */ 899 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
900 break;
901 caseTCP_CLOSE:
902 /* 903 * don't touch the timer. 904 */ 905 break;
906 default:
907 /* 908 * Must check send_head and write_queue 909 * to determine which timeout to use. 910 */ 911 if (sk->send_head || !skb_queue_empty(&sk->write_queue)) { 912 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
913 }elseif (sk->keepopen) { 914 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
915 }else{ 916 del_timer(&sk->retransmit_timer);
917 sk->ip_xmit_timeout = 0;
918 } 919 break;
920 } 921 } 922
923 /* 924 * We have nothing queued but space to send. Send any partial 925 * packets immediately (end of Nagle rule application). 926 */ 927
928 if (sk->packets_out == 0
929 && sk->partial != NULL 930 && skb_queue_empty(&sk->write_queue)
931 && sk->send_head == NULL)
932 { 933 flag |= 1;
934 tcp_send_partial(sk);
935 } 936
937 /* 938 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and 939 * we are now waiting for an acknowledge to our FIN. The other end is 940 * already in TIME_WAIT. 941 * 942 * Move to TCP_CLOSE on success. 943 */ 944
945 if (sk->state == TCP_LAST_ACK)
946 { 947 if (!sk->dead)
948 sk->state_change(sk);
949 if(sk->debug)
950 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
951 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
952 if (sk->rcv_ack_seq == sk->write_seq/*&& sk->acked_seq == sk->fin_seq*/)
953 { 954 flag |= 1;
955 sk->shutdown = SHUTDOWN_MASK;
956 tcp_set_state(sk,TCP_CLOSE);
957 return 1;
958 } 959 } 960
961 /* 962 * Incoming ACK to a FIN we sent in the case of our initiating the close. 963 * 964 * Move to FIN_WAIT2 to await a FIN from the other end. Set 965 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in. 966 */ 967
968 if (sk->state == TCP_FIN_WAIT1)
969 { 970
971 if (!sk->dead)
972 sk->state_change(sk);
973 if (sk->rcv_ack_seq == sk->write_seq)
974 { 975 flag |= 1;
976 sk->shutdown |= SEND_SHUTDOWN;
977 tcp_set_state(sk, TCP_FIN_WAIT2);
978 } 979 } 980
981 /* 982 * Incoming ACK to a FIN we sent in the case of a simultaneous close. 983 * 984 * Move to TIME_WAIT 985 */ 986
987 if (sk->state == TCP_CLOSING)
988 { 989
990 if (!sk->dead)
991 sk->state_change(sk);
992 if (sk->rcv_ack_seq == sk->write_seq)
993 { 994 flag |= 1;
995 tcp_time_wait(sk);
996 } 997 } 998
999 /*1000 * Final ack of a three way shake 1001 */1002
1003 if(sk->state==TCP_SYN_RECV)
1004 {1005 tcp_set_state(sk, TCP_ESTABLISHED);
1006 tcp_options(sk,th);
1007 sk->dummy_th.dest=th->source;
1008 sk->copied_seq = sk->acked_seq;
1009 if(!sk->dead)
1010 sk->state_change(sk);
1011 if(sk->max_window==0)
1012 {1013 sk->max_window=32; /* Sanity check */1014 sk->mss=min(sk->max_window,sk->mtu);
1015 }1016 }1017
1018 /*1019 * I make no guarantees about the first clause in the following1020 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under1021 * what conditions "!flag" would be true. However I think the rest1022 * of the conditions would prevent that from causing any1023 * unnecessary retransmission. 1024 * Clearly if the first packet has expired it should be 1025 * retransmitted. The other alternative, "flag&2 && retransmits", is1026 * harder to explain: You have to look carefully at how and when the1027 * timer is set and with what timeout. The most recent transmission always1028 * sets the timer. So in general if the most recent thing has timed1029 * out, everything before it has as well. So we want to go ahead and1030 * retransmit some more. If we didn't explicitly test for this1031 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"1032 * would not be true. If you look at the pattern of timing, you can1033 * show that rto is increased fast enough that the next packet would1034 * almost never be retransmitted immediately. Then you'd end up1035 * waiting for a timeout to send each packet on the retransmission1036 * queue. With my implementation of the Karn sampling algorithm,1037 * the timeout would double each time. The net result is that it would1038 * take a hideous amount of time to recover from a single dropped packet.1039 * It's possible that there should also be a test for TIME_WRITE, but1040 * I think as long as "send_head != NULL" and "retransmit" is on, we've1041 * got to be in real retransmission mode.1042 * Note that tcp_do_retransmit is called with all==1. Setting cong_window1043 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.1044 * As long as no further losses occur, this seems reasonable.1045 */1046
1047 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
1048 (((flag&2) && sk->retransmits) ||
1049 (flag&8) ||
1050 (sk->send_head->when + sk->rto < jiffies)))
1051 {1052 if(sk->send_head->when + sk->rto < jiffies)
1053 tcp_retransmit(sk,0);
1054 else1055 {1056 tcp_do_retransmit(sk, 1);
1057 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1058 }1059 }1060
1061 return 1;
1062
1063 uninteresting_ack:
1064 if(sk->debug)
1065 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
1066
1067 /*1068 * Keepalive processing.1069 */1070
1071 if (after(ack, sk->sent_seq))
1072 {1073 return 0;
1074 }1075
1076 /*1077 * Restart the keepalive timer.1078 */1079
1080 if (sk->keepopen)
1081 {1082 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
1083 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1084 }1085 return 1;
1086 }1087
1088
1089 /*1090 * Process the FIN bit. This now behaves as it is supposed to work1091 * and the FIN takes effect when it is validly part of sequence1092 * space. Not before when we get holes.1093 *1094 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT1095 * (and thence onto LAST-ACK and finally, CLOSE, we never enter1096 * TIME-WAIT)1097 *1098 * If we are in FINWAIT-1, a received FIN indicates simultaneous1099 * close and we go into CLOSING (and later onto TIME-WAIT)1100 *1101 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.1102 *1103 */1104
1105 staticinttcp_fin(structsk_buff *skb, structsock *sk, structtcphdr *th)
/* */1106 {1107 sk->fin_seq = skb->end_seq;
1108
1109 if (!sk->dead)
1110 {1111 sk->state_change(sk);
1112 sock_wake_async(sk->socket, 1);
1113 }1114
1115 switch(sk->state)
1116 {1117 caseTCP_SYN_RECV:
1118 caseTCP_SYN_SENT:
1119 caseTCP_ESTABLISHED:
1120 /*1121 * move to CLOSE_WAIT, tcp_data() already handled1122 * sending the ack.1123 */1124 tcp_set_state(sk,TCP_CLOSE_WAIT);
1125 if (th->rst)
1126 sk->shutdown = SHUTDOWN_MASK;
1127 break;
1128
1129 caseTCP_CLOSE_WAIT:
1130 caseTCP_CLOSING:
1131 /*1132 * received a retransmission of the FIN, do1133 * nothing.1134 */1135 break;
1136 caseTCP_TIME_WAIT:
1137 /*1138 * received a retransmission of the FIN,1139 * restart the TIME_WAIT timer.1140 */1141 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1142 return(0);
1143 caseTCP_FIN_WAIT1:
1144 /*1145 * This case occurs when a simultaneous close1146 * happens, we must ack the received FIN and1147 * enter the CLOSING state.1148 *1149 * This causes a WRITE timeout, which will either1150 * move on to TIME_WAIT when we timeout, or resend1151 * the FIN properly (maybe we get rid of that annoying1152 * FIN lost hang). The TIME_WRITE code is already correct1153 * for handling this timeout.1154 */1155
1156 if(sk->ip_xmit_timeout != TIME_WRITE)
1157 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1158 tcp_set_state(sk,TCP_CLOSING);
1159 break;
1160 caseTCP_FIN_WAIT2:
1161 /*1162 * received a FIN -- send ACK and enter TIME_WAIT1163 */1164 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1165 sk->shutdown|=SHUTDOWN_MASK;
1166 tcp_set_state(sk,TCP_TIME_WAIT);
1167 break;
1168 caseTCP_CLOSE:
1169 /*1170 * already in CLOSE1171 */1172 break;
1173 default:
1174 tcp_set_state(sk,TCP_LAST_ACK);
1175
1176 /* Start the timers. */1177 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1178 return(0);
1179 }1180
1181 return(0);
1182 }1183
1184 /*1185 * Add a sk_buff to the TCP receive queue, calculating1186 * the ACK sequence as we go..1187 */1188 staticinlinevoidtcp_insert_skb(structsk_buff * skb, structsk_buff_head * list)
/* */1189 {1190 structsk_buff * prev, * next;
1191 u32seq;
1192
1193 /*1194 * Find where the new skb goes.. (This goes backwards,1195 * on the assumption that we get the packets in order)1196 */1197 seq = skb->seq;
1198 prev = list->prev;
1199 next = (structsk_buff *) list;
1200 for (;;) {1201 if (prev == (structsk_buff *) list || !after(prev->seq, seq))
1202 break;
1203 next = prev;
1204 prev = prev->prev;
1205 }1206 __skb_insert(skb, prev, next, list);
1207 }1208
1209 /*1210 * Called for each packet when we find a new ACK endpoint sequence in it1211 */1212 staticinlineu32tcp_queue_ack(structsk_buff * skb, structsock * sk)
/* */1213 {1214 /*1215 * When we ack the fin, we do the FIN 1216 * processing.1217 */1218 skb->acked = 1;
1219 if (skb->h.th->fin)
1220 tcp_fin(skb,sk,skb->h.th);
1221 returnskb->end_seq;
1222 }1223
1224 staticvoidtcp_queue(structsk_buff * skb, structsock * sk, structtcphdr *th)
/* */1225 {1226 u32ack_seq;
1227
1228 tcp_insert_skb(skb, &sk->receive_queue);
1229
1230 /*1231 * Did we get anything new to ack?1232 */1233 ack_seq = sk->acked_seq;
1234
1235
1236 if (!after(skb->seq, ack_seq)) {1237 if (after(skb->end_seq, ack_seq)) {1238 /* the packet straddles our window end */1239 structsk_buff_head * list = &sk->receive_queue;
1240 structsk_buff * next;
1241 ack_seq = tcp_queue_ack(skb, sk);
1242
1243 /*1244 * Do we have any old packets to ack that the above1245 * made visible? (Go forward from skb)1246 */1247 next = skb->next;
1248 while (next != (structsk_buff *) list) {1249 if (after(next->seq, ack_seq))
1250 break;
1251 if (after(next->end_seq, ack_seq))
1252 ack_seq = tcp_queue_ack(next, sk);
1253 next = next->next;
1254 }1255
1256 /*1257 * Ok, we found new data, update acked_seq as1258 * necessary (and possibly send the actual1259 * ACK packet).1260 */1261 sk->acked_seq = ack_seq;
1262
1263 }else{1264 if (sk->debug)
1265 printk("Ack duplicate packet.\n");
1266 tcp_send_ack(sk);
1267 return;
1268 }1269
1270
1271 /*1272 * Delay the ack if possible. Send ack's to1273 * fin frames immediately as there shouldn't be1274 * anything more to come.1275 */1276 if (!sk->delay_acks || th->fin) {1277 tcp_send_ack(sk);
1278 }else{1279 /*1280 * If psh is set we assume it's an1281 * interactive session that wants quick1282 * acks to avoid nagling too much. 1283 */1284 intdelay = HZ/2;
1285 if (th->psh)
1286 delay = HZ/50;
1287 tcp_send_delayed_ack(sk, delay);
1288 }1289
1290 /*1291 * Tell the user we have some more data.1292 */1293
1294 if (!sk->dead)
1295 sk->data_ready(sk,0);
1296
1297 }1298 else1299 {1300 /*1301 * If we've missed a packet, send an ack.1302 * Also start a timer to send another.1303 *1304 * 4.3reno machines look for these kind of acks so1305 * they can do fast recovery. Three identical 'old'1306 * acks lets it know that one frame has been lost1307 * and should be resent. Because this is before the1308 * whole window of data has timed out it can take1309 * one lost frame per window without stalling.1310 * [See Jacobson RFC1323, Stevens TCP/IP illus vol2]1311 *1312 * We also should be spotting triple bad sequences.1313 * [We now do this.]1314 *1315 */1316
1317 if (!skb->acked)
1318 {1319 if(sk->debug)
1320 printk("Ack past end of seq packet.\n");
1321 tcp_send_ack(sk);
1322 tcp_send_delayed_ack(sk,HZ/2);
1323 }1324 }1325 }1326
1327
1328 /*1329 * This routine handles the data. If there is room in the buffer,1330 * it will be have already been moved into it. If there is no1331 * room, then we will just have to discard the packet.1332 */1333
1334 staticinttcp_data(structsk_buff *skb, structsock *sk,
/* */1335 unsignedlongsaddr, unsignedintlen)
1336 {1337 structtcphdr *th;
1338 u32new_seq, shut_seq;
1339
1340 th = skb->h.th;
1341 skb_pull(skb,th->doff*4);
1342 skb_trim(skb,len-(th->doff*4));
1343
1344 /*1345 * The bytes in the receive read/assembly queue has increased. Needed for the1346 * low memory discard algorithm 1347 */1348
1349 sk->bytes_rcv += skb->len;
1350
1351 if (skb->len == 0 && !th->fin)
1352 {1353 /* 1354 * Don't want to keep passing ack's back and forth. 1355 * (someone sent us dataless, boring frame)1356 */1357 if (!th->ack)
1358 tcp_send_ack(sk);
1359 kfree_skb(skb, FREE_READ);
1360 return(0);
1361 }1362
1363 /*1364 * We no longer have anyone receiving data on this connection.1365 */1366
1367 #ifndef TCP_DONT_RST_SHUTDOWN
1368
1369 if(sk->shutdown & RCV_SHUTDOWN)
1370 {1371 /*1372 * FIXME: BSD has some magic to avoid sending resets to1373 * broken 4.2 BSD keepalives. Much to my surprise a few non1374 * BSD stacks still have broken keepalives so we want to1375 * cope with it.1376 */1377
1378 if(skb->len) /* We don't care if it's just an ack or1379 a keepalive/window probe */1380 {1381 new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */1382
1383 /* Do this the way 4.4BSD treats it. Not what I'd1384 regard as the meaning of the spec but it's what BSD1385 does and clearly they know everything 8) */1386
1387 /*1388 * This is valid because of two things1389 *1390 * a) The way tcp_data behaves at the bottom.1391 * b) A fin takes effect when read not when received.1392 */1393
1394 shut_seq = sk->acked_seq+1; /* Last byte */1395
1396 if(after(new_seq,shut_seq))
1397 {1398 if(sk->debug)
1399 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
1400 sk, new_seq, shut_seq, sk->blog);
1401 if(sk->dead)
1402 {1403 sk->acked_seq = new_seq + th->fin;
1404 tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
1405 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
1406 tcp_statistics.TcpEstabResets++;
1407 sk->err = EPIPE;
1408 sk->error_report(sk);
1409 sk->shutdown = SHUTDOWN_MASK;
1410 tcp_set_state(sk,TCP_CLOSE);
1411 kfree_skb(skb, FREE_READ);
1412 return 0;
1413 }1414 }1415 }1416 }1417
1418 #endif1419
1420 tcp_queue(skb, sk, th);
1421
1422 return(0);
1423 }1424
1425
1426 /*1427 * This routine is only called when we have urgent data1428 * signalled. Its the 'slow' part of tcp_urg. It could be1429 * moved inline now as tcp_urg is only called from one1430 * place. We handle URGent data wrong. We have to - as1431 * BSD still doesn't use the correction from RFC961.1432 */1433
1434 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */1435 {1436 u32ptr = ntohs(th->urg_ptr);
1437
1438 if (ptr)
1439 ptr--;
1440 ptr += ntohl(th->seq);
1441
1442 /* ignore urgent data that we've already seen and read */1443 if (after(sk->copied_seq, ptr))
1444 return;
1445
1446 /* do we already have a newer (or duplicate) urgent pointer? */1447 if (sk->urg_data && !after(ptr, sk->urg_seq))
1448 return;
1449
1450 /* tell the world about our new urgent pointer */1451 if (sk->proc != 0) {1452 if (sk->proc > 0) {1453 kill_proc(sk->proc, SIGURG, 1);
1454 }else{1455 kill_pg(-sk->proc, SIGURG, 1);
1456 }1457 }1458 sk->urg_data = URG_NOTYET;
1459 sk->urg_seq = ptr;
1460 }1461
1462 /*1463 * This is the 'fast' part of urgent handling.1464 */1465
1466 staticinlinevoidtcp_urg(structsock *sk, structtcphdr *th, unsignedlonglen)
/* */1467 {1468 /*1469 * Check if we get a new urgent pointer - normally not 1470 */1471
1472 if (th->urg)
1473 tcp_check_urg(sk,th);
1474
1475 /*1476 * Do we wait for any urgent data? - normally not1477 */1478
1479 if (sk->urg_data == URG_NOTYET) {1480 u32ptr;
1481
1482 /*1483 * Is the urgent pointer pointing into this packet? 1484 */1485 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
1486 if (ptr < len) {1487 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
1488 if (!sk->dead)
1489 sk->data_ready(sk,0);
1490 }1491 }1492 }1493
1494 /*1495 * This should be a bit smarter and remove partially1496 * overlapping stuff too, but this should be good1497 * enough for any even remotely normal case (and the1498 * worst that can happen is that we have a few1499 * unnecessary packets in the receive queue).1500 *1501 * This function is never called with an empty list..1502 */1503 staticinlinevoidtcp_remove_dups(structsk_buff_head * list)
/* */1504 {1505 structsk_buff * next = list->next;
1506
1507 for (;;) {1508 structsk_buff * skb = next;
1509 next = next->next;
1510 if (next == (structsk_buff *) list)
1511 break;
1512 if (before(next->end_seq, skb->end_seq)) {1513 __skb_unlink(next, list);
1514 kfree_skb(next, FREE_READ);
1515 next = skb;
1516 continue;
1517 }1518 if (next->seq != skb->seq)
1519 continue;
1520 __skb_unlink(skb, list);
1521 kfree_skb(skb, FREE_READ);
1522 }1523 }1524
1525 /*1526 * Throw out all unnecessary packets: we've gone over the1527 * receive queue limit. This shouldn't happen in a normal1528 * TCP connection, but we might have gotten duplicates etc.1529 */1530 staticvoidprune_queue(structsk_buff_head * list)
/* */1531 {1532 for (;;) {1533 structsk_buff * skb = list->prev;
1534
1535 /* gone through it all? */1536 if (skb == (structsk_buff *) list)
1537 break;
1538 if (!skb->acked) {1539 __skb_unlink(skb, list);
1540 kfree_skb(skb, FREE_READ);
1541 continue;
1542 }1543 tcp_remove_dups(list);
1544 break;
1545 }1546 }1547
1548 /*1549 * A TCP packet has arrived.1550 * skb->h.raw is the TCP header.1551 */1552
1553 inttcp_rcv(structsk_buff *skb, structdevice *dev, structoptions *opt,
/* */1554 __u32daddr, unsignedshortlen,
1555 __u32saddr, intredo, structinet_protocol * protocol)
1556 {1557 structtcphdr *th;
1558 structsock *sk;
1559 intsyn_ok=0;
1560
1561 /*1562 * "redo" is 1 if we have already seen this skb but couldn't1563 * use it at that time (the socket was locked). In that case1564 * we have already done a lot of the work (looked up the socket1565 * etc).1566 */1567 th = skb->h.th;
1568 sk = skb->sk;
1569 if (!redo) {1570 tcp_statistics.TcpInSegs++;
1571 if (skb->pkt_type!=PACKET_HOST)
1572 gotodiscard_it;
1573
1574 /*1575 * Pull up the IP header.1576 */1577
1578 skb_pull(skb, skb->h.raw-skb->data);
1579
1580 /*1581 * Try to use the device checksum if provided.1582 */1583 switch (skb->ip_summed)
1584 {1585 caseCHECKSUM_NONE:
1586 skb->csum = csum_partial((char *)th, len, 0);
1587 caseCHECKSUM_HW:
1588 if (tcp_check(th, len, saddr, daddr, skb->csum))
1589 gotodiscard_it;
1590 default:
1591 /* CHECKSUM_UNNECESSARY */1592 }1593 sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
1594 if (!sk)
1595 gotono_tcp_socket;
1596 skb->sk = sk;
1597 skb->seq = ntohl(th->seq);
1598 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
1599 skb->ack_seq = ntohl(th->ack_seq);
1600
1601 skb->acked = 0;
1602 skb->used = 0;
1603 skb->free = 1;
1604 skb->saddr = daddr;
1605 skb->daddr = saddr;
1606
1607 /*1608 * We may need to add it to the backlog here. 1609 */1610 if (sk->users)
1611 {1612 __skb_queue_tail(&sk->back_log, skb);
1613 return(0);
1614 }1615 }1616
1617 /*1618 * If this socket has got a reset it's to all intents and purposes 1619 * really dead. Count closed sockets as dead.1620 *1621 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD1622 * simply drops data. This seems incorrect as a 'closed' TCP doesn't1623 * exist so should cause resets as if the port was unreachable.1624 */1625
1626 if (sk->zapped || sk->state==TCP_CLOSE)
1627 gotono_tcp_socket;
1628
1629 if (!sk->prot)
1630 {1631 printk("IMPOSSIBLE 3\n");
1632 return(0);
1633 }1634
1635
1636 /*1637 * Charge the memory to the socket. 1638 */1639
1640 skb->sk=sk;
1641 atomic_add(skb->truesize, &sk->rmem_alloc);
1642
1643 /*1644 * We should now do header prediction.1645 */1646
1647 /*1648 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We1649 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug1650 * compatibility. We also set up variables more thoroughly [Karn notes in the1651 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].1652 */1653
1654 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */1655 {1656
1657 /*1658 * Now deal with unusual cases.1659 */1660
1661 if(sk->state==TCP_LISTEN)
1662 {1663 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */1664 tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
1665
1666 /*1667 * We don't care for RST, and non SYN are absorbed (old segments)1668 * Broadcast/multicast SYN isn't allowed. Note - bug if you change the1669 * netmask on a running connection it can go broadcast. Even Sun's have1670 * this problem so I'm ignoring it 1671 */1672
1673 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
1674 {1675 kfree_skb(skb, FREE_READ);
1676 return 0;
1677 }1678
1679 /* 1680 * Guess we need to make a new socket up 1681 */1682
1683 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
1684
1685 /*1686 * Now we have several options: In theory there is nothing else1687 * in the frame. KA9Q has an option to send data with the syn,1688 * BSD accepts data with the syn up to the [to be] advertised window1689 * and Solaris 2.1 gives you a protocol error. For now we just ignore1690 * it, that fits the spec precisely and avoids incompatibilities. It1691 * would be nice in future to drop through and process the data.1692 *1693 * Now TTCP is starting to use we ought to queue this data.1694 */1695
1696 return 0;
1697 }1698
1699 /* 1700 * Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN1701 * then its a new connection1702 */1703
1704 if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
1705 {1706 kfree_skb(skb, FREE_READ);
1707 return 0;
1708 }1709
1710 /*1711 * SYN sent means we have to look for a suitable ack and either reset1712 * for bad matches or go to connected. The SYN_SENT case is unusual and should1713 * not be in line code. [AC]1714 */1715
1716 if(sk->state==TCP_SYN_SENT)
1717 {1718 /* Crossed SYN or previous junk segment */1719 if(th->ack)
1720 {1721 /* We got an ack, but it's not a good ack */1722 if(!tcp_ack(sk,th,skb->ack_seq,len))
1723 {1724 /* Reset the ack - its an ack from a 1725 different connection [ th->rst is checked in tcp_send_reset()] */1726 tcp_statistics.TcpAttemptFails++;
1727 tcp_send_reset(daddr, saddr, th,
1728 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1729 kfree_skb(skb, FREE_READ);
1730 return(0);
1731 }1732 if(th->rst)
1733 returntcp_reset(sk,skb);
1734 if(!th->syn)
1735 {1736 /* A valid ack from a different connection1737 start. Shouldn't happen but cover it */1738 tcp_statistics.TcpAttemptFails++;
1739 tcp_send_reset(daddr, saddr, th,
1740 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1741 kfree_skb(skb, FREE_READ);
1742 return 0;
1743 }1744 /*1745 * Ok.. it's good. Set up sequence numbers and1746 * move to established.1747 */1748 syn_ok=1; /* Don't reset this connection for the syn */1749 sk->acked_seq = skb->seq+1;
1750 sk->lastwin_seq = skb->seq+1;
1751 sk->fin_seq = skb->seq;
1752 tcp_send_ack(sk);
1753 tcp_set_state(sk, TCP_ESTABLISHED);
1754 tcp_options(sk,th);
1755 sk->dummy_th.dest=th->source;
1756 sk->copied_seq = sk->acked_seq;
1757 if(!sk->dead)
1758 {1759 sk->state_change(sk);
1760 sock_wake_async(sk->socket, 0);
1761 }1762 if(sk->max_window==0)
1763 {1764 sk->max_window = 32;
1765 sk->mss = min(sk->max_window, sk->mtu);
1766 }1767 }1768 else1769 {1770 /* See if SYN's cross. Drop if boring */1771 if(th->syn && !th->rst)
1772 {1773 /* Crossed SYN's are fine - but talking to1774 yourself is right out... */1775 if(sk->saddr==saddr && sk->daddr==daddr &&
1776 sk->dummy_th.source==th->source &&
1777 sk->dummy_th.dest==th->dest)
1778 {1779 tcp_statistics.TcpAttemptFails++;
1780 returntcp_reset(sk,skb);
1781 }1782 tcp_set_state(sk,TCP_SYN_RECV);
1783
1784 /*1785 * FIXME:1786 * Must send SYN|ACK here1787 */1788 }1789 /* Discard junk segment */1790 kfree_skb(skb, FREE_READ);
1791 return 0;
1792 }1793 /*1794 * SYN_RECV with data maybe.. drop through1795 */1796 gotorfc_step6;
1797 }1798
1799 /*1800 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is1801 * a more complex suggestion for fixing these reuse issues in RFC16441802 * but not yet ready for general use. Also see RFC1379.1803 *1804 * Note the funny way we go back to the top of this function for1805 * this case ("goto try_next_socket"). That also takes care of1806 * checking "sk->users" for the new socket as well as doing all1807 * the normal tests on the packet.1808 */1809
1810 #defineBSD_TIME_WAIT1811 #ifdefBSD_TIME_WAIT1812 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
1813 after(skb->seq, sk->acked_seq) && !th->rst)
1814 {1815 u32seq = sk->write_seq;
1816 if(sk->debug)
1817 printk("Doing a BSD time wait\n");
1818 tcp_statistics.TcpEstabResets++;
1819 atomic_sub(skb->truesize, &sk->rmem_alloc);
1820 skb->sk = NULL;
1821 sk->err=ECONNRESET;
1822 tcp_set_state(sk, TCP_CLOSE);
1823 sk->shutdown = SHUTDOWN_MASK;
1824 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
1825 /* this is not really correct: we should check sk->users */1826 if (sk && sk->state==TCP_LISTEN)
1827 {1828 skb->sk = sk;
1829 atomic_add(skb->truesize, &sk->rmem_alloc);
1830 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
1831 return 0;
1832 }1833 kfree_skb(skb, FREE_READ);
1834 return 0;
1835 }1836 #endif1837 }1838
1839 /*1840 * We are now in normal data flow (see the step list in the RFC)1841 * Note most of these are inline now. I'll inline the lot when1842 * I have time to test it hard and look at what gcc outputs 1843 */1844
1845 if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
1846 {1847 bad_tcp_sequence(sk, th, skb->end_seq-th->syn, dev);
1848 kfree_skb(skb, FREE_READ);
1849 return 0;
1850 }1851
1852 if(th->rst)
1853 returntcp_reset(sk,skb);
1854
1855 /*1856 * !syn_ok is effectively the state test in RFC793.1857 */1858
1859 if(th->syn && !syn_ok)
1860 {1861 tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
1862 returntcp_reset(sk,skb);
1863 }1864
1865 tcp_delack_estimator(sk);
1866
1867 /*1868 * Process the ACK1869 */1870
1871
1872 if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
1873 {1874 /*1875 * Our three way handshake failed.1876 */1877
1878 if(sk->state==TCP_SYN_RECV)
1879 {1880 tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
1881 }1882 kfree_skb(skb, FREE_READ);
1883 return 0;
1884 }1885
1886 rfc_step6: /* I'll clean this up later */1887
1888 /*1889 * If the accepted buffer put us over our queue size we1890 * now drop it (we must process the ack first to avoid1891 * deadlock cases).1892 */1893
1894 /*1895 * Process urgent data1896 */1897
1898 tcp_urg(sk, th, len);
1899
1900 /*1901 * Process the encapsulated data1902 */1903
1904 if(tcp_data(skb,sk, saddr, len))
1905 kfree_skb(skb, FREE_READ);
1906
1907 /*1908 * If our receive queue has grown past its limits,1909 * try to prune away duplicates etc..1910 */1911 if (sk->rmem_alloc > sk->rcvbuf)
1912 prune_queue(&sk->receive_queue);
1913
1914 /*1915 * And done1916 */1917
1918 return 0;
1919
1920 no_tcp_socket:
1921 /*1922 * No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)1923 */1924 tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
1925
1926 discard_it:
1927 /*1928 * Discard frame1929 */1930 skb->sk = NULL;
1931 kfree_skb(skb, FREE_READ);
1932 return 0;
1933 }