1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp_input.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 */ 22
23 #include <linux/config.h>
24 #include <net/tcp.h>
25
/*
 *	Policy code extracted so it's now separate
 */
/*
 *	Called each time to estimate the delayed ack timeout. This is
 *	how it should be done so a fast link isn't impacted by ack delay.
 */
35 extern__inline__voidtcp_delack_estimator(structsock *sk)
/* */ 36 { 37 /* 38 * Delayed ACK time estimator. 39 */ 40
41 if (sk->lrcvtime == 0)
42 { 43 sk->lrcvtime = jiffies;
44 sk->ato = HZ/3;
45 } 46 else 47 { 48 intm;
49
50 m = jiffies - sk->lrcvtime;
51
52 sk->lrcvtime = jiffies;
53
54 if (m <= 0)
55 m = 1;
56
57 if (m > (sk->rtt >> 3))
58 { 59 sk->ato = sk->rtt >> 3;
60 /* 61 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato); 62 */ 63 } 64 else 65 { 66 sk->ato = (sk->ato >> 1) + m;
67 /* 68 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato); 69 */ 70 } 71 } 72 } 73
74 /* 75 * Called on frames that were known _not_ to have been 76 * retransmitted [see Karn/Partridge Proceedings SIGCOMM 87]. 77 * The algorithm is from the SIGCOMM 88 piece by Van Jacobson. 78 */ 79
80 extern__inline__voidtcp_rtt_estimator(structsock *sk, structsk_buff *oskb)
/* */ 81 { 82 longm;
83 /* 84 * The following amusing code comes from Jacobson's 85 * article in SIGCOMM '88. Note that rtt and mdev 86 * are scaled versions of rtt and mean deviation. 87 * This is designed to be as fast as possible 88 * m stands for "measurement". 89 */ 90
91 m = jiffies - oskb->when; /* RTT */ 92 if(m<=0)
93 m=1; /* IS THIS RIGHT FOR <0 ??? */ 94 m -= (sk->rtt >> 3); /* m is now error in rtt est */ 95 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */ 96 if (m < 0)
97 m = -m; /* m is now abs(error) */ 98 m -= (sk->mdev >> 2); /* similar update on mdev */ 99 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ 100
101 /* 102 * Now update timeout. Note that this removes any backoff. 103 */ 104
105 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
106 if (sk->rto > 120*HZ)
107 sk->rto = 120*HZ;
108 if (sk->rto < HZ/5) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */ 109 sk->rto = HZ/5;
110 sk->backoff = 0;
111 } 112
113 /* 114 * Cached last hit socket 115 */ 116
/*
 *	One-entry "last hit" socket demultiplex cache: the address/port
 *	四-tuple of the most recently matched connection plus the socket
 *	it resolved to.  th_cache_sk == NULL means the cache is empty.
 *	volatile: updated and zapped asynchronously (see tcp_cache_zap).
 */
static volatile unsigned long th_cache_saddr, th_cache_daddr;
static volatile unsigned short th_cache_dport, th_cache_sport;
static volatile struct sock *th_cache_sk;
120
/*
 *	Invalidate the one-entry socket lookup cache.  Must be called
 *	whenever a cached socket may have become stale (socket created,
 *	destroyed or rebound).
 */
void tcp_cache_zap(void)
{
	th_cache_sk=NULL;
}
126 /* 127 * Find the socket, using the last hit cache if applicable. The cache is not quite 128 * right... 129 */ 130
131 staticinlinestructsock * get_tcp_sock(u32saddr, u16sport, u32daddr, u16dport)
/* */ 132 { 133 structsock * sk;
134
135 sk = (structsock *) th_cache_sk;
136 if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
137 sport != th_cache_sport || dport != th_cache_dport) { 138 sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
139 if (sk) { 140 th_cache_saddr=saddr;
141 th_cache_daddr=daddr;
142 th_cache_dport=dport;
143 th_cache_sport=sport;
144 th_cache_sk=sk;
145 } 146 } 147 returnsk;
148 } 149
/*
 *	React to an out-of-window TCP sequence number in an incoming packet
 */
154 staticvoidbad_tcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */ 155 structoptions *opt, unsignedlongsaddr, structdevice *dev)
156 { 157 if (th->rst)
158 return;
159
160 /* 161 * Send a reset if we get something not ours and we are 162 * unsynchronized. Note: We don't do anything to our end. We 163 * are just killing the bogus remote connection then we will 164 * connect again and it will work (with luck). 165 */ 166
167 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
168 { 169 tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
170 return;
171 } 172
173 /* 174 * 4.3reno machines look for these kind of acks so they can do fast 175 * recovery. Three identical 'old' acks lets it know that one frame has 176 * been lost and should be resent. Because this is before the whole window 177 * of data has timed out it can take one lost frame per window without 178 * stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2] 179 * 180 * We also should be spotting triple bad sequences. 181 */ 182 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
183 return;
184 } 185
/*
 *	This function checks to see if the tcp header is actually acceptable.
 */
190 extern__inline__inttcp_sequence(structsock *sk, u32seq, u32end_seq)
/* */ 191 { 192 u32end_window = sk->acked_seq + sk->window;
193 return/* if start is at end of window, end must be too (zero window) */ 194 (seq == end_window && seq == end_seq) ||
195 /* if start is before end of window, check for interest */ 196 (before(seq, end_window) && !before(end_seq, sk->acked_seq));
197 } 198
199 /* 200 * When we get a reset we do this. This probably is a tcp_output routine 201 * really. 202 */ 203
static int tcp_reset(struct sock *sk, struct sk_buff *skb)
{
	/*
	 *	Incoming RST: tear the connection down.  This probably is a
	 *	tcp_output routine really.
	 */
	sk->zapped = 1;

	/*
	 *	We want the right error as BSD sees it (and indeed as we do).
	 */
	switch (sk->state) {
	case TCP_SYN_SENT:
		sk->err = ECONNREFUSED;
		break;
	case TCP_CLOSE_WAIT:
		sk->err = EPIPE;
		break;
	default:
		sk->err = ECONNRESET;
		break;
	}
#ifdef CONFIG_TCP_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 *
	 *	This is a good idea, but causes more sockets to take time to close.
	 *
	 *	Ian Heavens has since shown this is an inadequate fix for the protocol
	 *	bug in question.
	 */
	if (sk->state != TCP_TIME_WAIT) {
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	return(0);
}
239
240 /* 241 * Look for tcp options. Parses everything but only knows about MSS. 242 * This routine is always called with the packet containing the SYN. 243 * However it may also be called with the ack to the SYN. So you 244 * can't assume this is always the SYN. It's always called after 245 * we have set up sk->mtu to our own MTU. 246 * 247 * We need at minimum to add PAWS support here. Possibly large windows 248 * as Linux gets deployed on 100Mb/sec networks. 249 */ 250
static void tcp_options(struct sock *sk, struct tcphdr *th)
{
	unsigned char *ptr;
	/* Option bytes = header length (doff is in 32-bit words) minus the
	   fixed header. */
	int length=(th->doff*4)-sizeof(struct tcphdr);
	int mss_seen = 0;

	/* Options start immediately after the fixed TCP header. */
	ptr = (unsigned char *)(th + 1);

	while(length>0)
	{
		int opcode=*ptr++;
		int opsize=*ptr++;
		switch(opcode)
		{
			case TCPOPT_EOL:
				/* End of option list: stop parsing. */
				return;
			case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
				/* NOP is a single byte: it has no length octet. */
				length--;
				ptr--;		/* the opsize=*ptr++ above was a mistake */
				continue;

			default:
				if(opsize<=2)	/* Avoid silly options looping forever */
					return;
				switch(opcode)
				{
					case TCPOPT_MSS:
						/* Only honour MSS on a SYN, and only
						   lower our notion of the path MTU. */
						if(opsize==4 && th->syn)
						{
							sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
							mss_seen = 1;
						}
						break;
					/* Add other options here as people feel the urge to implement stuff like large windows */
				}
				/* Step over the option payload (opcode and
				   length octets were already consumed). */
				ptr+=opsize-2;
				length-=opsize;
		}
	}
	if (th->syn)
	{
		if (! mss_seen)
			sk->mtu=min(sk->mtu, 536);	/* default MSS if none sent */
	}
#ifdef CONFIG_INET_PCTCP
	/* Hack: halve the send segment for peers without SWS handling. */
	sk->mss = min(sk->max_window >> 1, sk->mtu);
#else
	sk->mss = min(sk->max_window, sk->mtu);
	sk->max_unacked = 2 * sk->mss;
#endif
}
303
304 /* 305 * This routine handles a connection request. 306 * It should make sure we haven't already responded. 307 * Because of the way BSD works, we have to send a syn/ack now. 308 * This also means it will be harder to close a socket which is 309 * listening. 310 */ 311
312 staticvoidtcp_conn_request(structsock *sk, structsk_buff *skb,
/* */ 313 u32daddr, u32saddr, structoptions *opt, structdevice *dev, u32seq)
314 { 315 structsock *newsk;
316 structtcphdr *th;
317 structrtable *rt;
318
319 th = skb->h.th;
320
321 /* If the socket is dead, don't accept the connection. */ 322 if (!sk->dead)
323 { 324 sk->data_ready(sk,0);
325 } 326 else 327 { 328 if(sk->debug)
329 printk("Reset on %p: Connect on dead socket.\n",sk);
330 tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
331 tcp_statistics.TcpAttemptFails++;
332 kfree_skb(skb, FREE_READ);
333 return;
334 } 335
336 /* 337 * Make sure we can accept more. This will prevent a 338 * flurry of syns from eating up all our memory. 339 * 340 * BSD does some funnies here and allows 3/2 times the 341 * set backlog as a fudge factor. Thats just too gross. 342 */ 343
344 if (sk->ack_backlog >= sk->max_ack_backlog)
345 { 346 tcp_statistics.TcpAttemptFails++;
347 kfree_skb(skb, FREE_READ);
348 return;
349 } 350
351 /* 352 * We need to build a new sock struct. 353 * It is sort of bad to have a socket without an inode attached 354 * to it, but the wake_up's will just wake up the listening socket, 355 * and if the listening socket is destroyed before this is taken 356 * off of the queue, this will take care of it. 357 */ 358
359 newsk = (structsock *) kmalloc(sizeof(structsock), GFP_ATOMIC);
360 if (newsk == NULL)
361 { 362 /* just ignore the syn. It will get retransmitted. */ 363 tcp_statistics.TcpAttemptFails++;
364 kfree_skb(skb, FREE_READ);
365 return;
366 } 367
368 memcpy(newsk, sk, sizeof(*newsk));
369 newsk->opt = NULL;
370 newsk->ip_route_cache = NULL;
371 if (opt && opt->optlen)
372 { 373 sk->opt = (structoptions*)kmalloc(sizeof(structoptions)+opt->optlen, GFP_ATOMIC);
374 if (!sk->opt)
375 { 376 kfree_s(newsk, sizeof(structsock));
377 tcp_statistics.TcpAttemptFails++;
378 kfree_skb(skb, FREE_READ);
379 return;
380 } 381 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb))
382 { 383 kfree_s(sk->opt, sizeof(structoptions)+opt->optlen);
384 kfree_s(newsk, sizeof(structsock));
385 tcp_statistics.TcpAttemptFails++;
386 kfree_skb(skb, FREE_READ);
387 return;
388 } 389 } 390 skb_queue_head_init(&newsk->write_queue);
391 skb_queue_head_init(&newsk->receive_queue);
392 newsk->send_head = NULL;
393 newsk->send_tail = NULL;
394 skb_queue_head_init(&newsk->back_log);
395 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/ 396 newsk->rto = TCP_TIMEOUT_INIT;
397 newsk->mdev = 0;
398 newsk->max_window = 0;
399 newsk->cong_window = 1;
400 newsk->cong_count = 0;
401 newsk->ssthresh = 0;
402 newsk->backoff = 0;
403 newsk->blog = 0;
404 newsk->intr = 0;
405 newsk->proc = 0;
406 newsk->done = 0;
407 newsk->partial = NULL;
408 newsk->pair = NULL;
409 newsk->wmem_alloc = 0;
410 newsk->rmem_alloc = 0;
411 newsk->localroute = sk->localroute;
412
413 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
414
415 newsk->err = 0;
416 newsk->shutdown = 0;
417 newsk->ack_backlog = 0;
418 newsk->acked_seq = skb->seq+1;
419 newsk->lastwin_seq = skb->seq+1;
420 newsk->delay_acks = 1;
421 newsk->copied_seq = skb->seq+1;
422 newsk->fin_seq = skb->seq;
423 newsk->state = TCP_SYN_RECV;
424 newsk->timeout = 0;
425 newsk->ip_xmit_timeout = 0;
426 newsk->write_seq = seq;
427 newsk->window_seq = newsk->write_seq;
428 newsk->rcv_ack_seq = newsk->write_seq;
429 newsk->urg_data = 0;
430 newsk->retransmits = 0;
431 newsk->linger=0;
432 newsk->destroy = 0;
433 init_timer(&newsk->timer);
434 newsk->timer.data = (unsignedlong)newsk;
435 newsk->timer.function = &net_timer;
436 init_timer(&newsk->retransmit_timer);
437 newsk->retransmit_timer.data = (unsignedlong)newsk;
438 newsk->retransmit_timer.function=&tcp_retransmit_timer;
439 newsk->dummy_th.source = skb->h.th->dest;
440 newsk->dummy_th.dest = skb->h.th->source;
441
442 /* 443 * Swap these two, they are from our point of view. 444 */ 445
446 newsk->daddr = saddr;
447 newsk->saddr = daddr;
448 newsk->rcv_saddr = daddr;
449
450 put_sock(newsk->num,newsk);
451 newsk->acked_seq = skb->seq + 1;
452 newsk->copied_seq = skb->seq + 1;
453 newsk->socket = NULL;
454
455 /* 456 * Grab the ttl and tos values and use them 457 */ 458
459 newsk->ip_ttl=sk->ip_ttl;
460 newsk->ip_tos=skb->ip_hdr->tos;
461
462 /* 463 * Use 512 or whatever user asked for 464 */ 465
466 /* 467 * Note use of sk->user_mss, since user has no direct access to newsk 468 */ 469
470 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
471 newsk->ip_route_cache = rt;
472
473 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
474 newsk->window_clamp = rt->rt_window;
475 else 476 newsk->window_clamp = 0;
477
478 if (sk->user_mss)
479 newsk->mtu = sk->user_mss;
480 elseif (rt)
481 newsk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
482 else 483 newsk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
484
485 /* 486 * But not bigger than device MTU 487 */ 488
489 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
490
491 #ifdefCONFIG_SKIP 492
493 /* 494 * SKIP devices set their MTU to 65535. This is so they can take packets 495 * unfragmented to security process then fragment. They could lie to the 496 * TCP layer about a suitable MTU, but its easier to let skip sort it out 497 * simply because the final package we want unfragmented is going to be 498 * 499 * [IPHDR][IPSP][Security data][Modified TCP data][Security data] 500 */ 501
502 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */ 503 sk->mtu=skip_pick_mtu(sk->mtu,dev);
504 #endif 505 /* 506 * This will min with what arrived in the packet 507 */ 508
509 tcp_options(newsk,skb->h.th);
510
511 tcp_cache_zap();
512 tcp_send_synack(newsk, sk, skb);
513 } 514
515
/*
 *	Handle a TCP window that shrunk on us. It shouldn't happen,
 *	but..
 *
 *	We may need to move packets from the send queue
 *	to the write queue, if the window has been shrunk on us.
 *	The RFC says you are not allowed to shrink your window
 *	like this, but if the other end does, you must be able
 *	to deal with it.
 */
void tcp_window_shrunk(struct sock * sk, u32 window_seq)
{
	struct sk_buff *skb;
	struct sk_buff *skb2;
	struct sk_buff *wskb = NULL;	/* last frame re-queued, keeps order */

	/* Detach the whole retransmit list; frames still inside the new
	   window are re-linked below, the rest go back to write_queue. */
	skb2 = sk->send_head;
	sk->send_head = NULL;
	sk->send_tail = NULL;

	/*
	 * This is an artifact of a flawed concept. We want one
	 * queue and a smarter send routine when we send all.
	 */
	cli();
	while (skb2 != NULL)
	{
		skb = skb2;
		skb2 = skb->link3;
		skb->link3 = NULL;
		if (after(skb->end_seq, window_seq))
		{
			/* Frame now lies beyond the shrunk window: pull it back. */
			if (sk->packets_out > 0)
				sk->packets_out--;
			/* We may need to remove this from the dev send list. */
			if (skb->next != NULL)
			{
				skb_unlink(skb);
			}
			/* Now add it to the write_queue. */
			if (wskb == NULL)
				skb_queue_head(&sk->write_queue,skb);
			else
				skb_append(wskb,skb);
			wskb = skb;
		}
		else
		{
			/* Still inside the window: keep it on the retransmit
			   list, preserving sequence order. */
			if (sk->send_head == NULL)
			{
				sk->send_head = skb;
				sk->send_tail = skb;
			}
			else
			{
				sk->send_tail->link3 = skb;
				sk->send_tail = skb;
			}
			skb->link3 = NULL;
		}
	}
	sti();
}
580
581 /* 582 * This routine deals with incoming acks, but not outgoing ones. 583 * 584 * This routine is totally _WRONG_. The list structuring is wrong, 585 * the algorithm is wrong, the code is wrong. 586 */ 587
static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
{
	int flag = 0;		/* bitmask of what this ack did, see below */
	u32 window_seq;

	/*
	 * 1 - there was data in packet as well as ack or new data is sent or
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if(sk->zapped)
		return(1);	/* Dead, cant ack any more so why bother */

	/*
	 * We have dropped back to keepalive timeouts. Thus we have
	 * no retransmits pending.
	 */

	if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 * If the ack is newer than sent or older than previous acks
	 * then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
		goto uninteresting_ack;

	/*
	 * If there is data set flag 1
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 * Have we discovered a larger window
	 */
	window_seq = ntohs(th->window);
	if (window_seq > sk->max_window)
	{
		sk->max_window = window_seq;
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(window_seq>>1, sk->mtu);
#else
		sk->mss = min(window_seq, sk->mtu);
#endif
	}
	/* window_seq becomes the absolute right edge of the send window. */
	window_seq += ack;

	/*
	 * See if our window has been shrunk.
	 */
	if (after(sk->window_seq, window_seq)) {
		flag |= 4;
		tcp_window_shrunk(sk, window_seq);
	}

	/*
	 * Update the right hand window edge of the host
	 */
	sk->window_seq = window_seq;

	/*
	 * Pipe has emptied
	 */
	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out= 0;
	}

	/*
	 * We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{

		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a
		 * counter and increment it once every cwnd times.  It's possible
		 * that this should be done only if sk->retransmits == 0.  I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly.  In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

	/*
	 *	Remember the highest ack received.
	 */

	sk->rcv_ack_seq = ack;

	/*
	 *	We passed data and got it acked, remove any soft error
	 *	log. Something worked...
	 */

	sk->err_soft = 0;

	/*
	 *	If this ack opens up a zero window, clear backoff.  It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open ?
		 */

		if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
		    ! before (sk->window_seq, sk->write_queue.next->end_seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt.  this eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < HZ/5)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = HZ/5;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	while(sk->send_head != NULL)
	{
		/* Check for a bug. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->end_seq, sk->send_head->link3->end_seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived the other end.
		 */

		if (before(sk->send_head->end_seq, ack+1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 *	We were retransmitting.  don't count this in RTT est
				 */
				flag |= 2;

				/*
				 * even though we've gotten an ack, we're still
				 * retransmitting as long as we're sending from
				 * the retransmit queue.  Keeping retransmits non-zero
				 * prevents us from getting new data interspersed with
				 * retransmissions.
				 */

				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 * Note that we only reset backoff and rto in the
			 * rtt recomputation code.  And that doesn't happen
			 * if there were retransmissions in effect.  So the
			 * first new packet after the retransmissions is
			 * sent with the backoff still in effect.  Not until
			 * we get an ack from a non-retransmitted packet do
			 * we reset the backoff and rto.  This allows us to deal
			 * with a situation where the network delay has increased
			 * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 *	We have one less packet out there.
			 */

			if (sk->packets_out > 0)
				sk->packets_out --;

			oskb = sk->send_head;

			if (!(flag&2)) 	/* Not retransmitting */
				tcp_rtt_estimator(sk,oskb);
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
					   In this case as we just set it up */
			cli();
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 *	We may need to remove this from the dev send list.
			 */

			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 * XXX someone ought to look at this too.. at the moment, if skb_peek()
	 * returns non-NULL, we complete ignore the timer stuff in the else
	 * clause.  We ought to organize the code so that else clause can
	 * (should) be executed regardless, possibly moving the PROBE timer
	 * reset over.  The skb_peek() thing should only move stuff to the
	 * write queue, NOT also manage the timer functions.
	 */

	/*
	 * Maybe we can take some stuff off of the write queue,
	 * and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room.
			 */
			tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 * from TCP_CLOSE we don't do anything
		 *
		 * from anything else, if there is write data (or fin) pending,
		 * we use a TIME_WRITE timeout, else if keepalive we reset to
		 * a KEEPALIVE timeout, else we delete the timer.
		 *
		 * We do not set flag for nominal write data, otherwise we may
		 * force a state where we start to write itsy bitsy tidbits
		 * of data.
		 */

		switch(sk->state) {
		case TCP_TIME_WAIT:
			/*
			 * keep us in TIME_WAIT until we stop getting packets,
			 * reset the timeout.
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 * don't touch the timer.
			 */
			break;
		default:
			/*
			 *	Must check send_head, write_queue, and ack_backlog
			 *	to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 *	We have nothing queued but space to send. Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
	 * we are now waiting for an acknowledge to our FIN.  The other end is
	 * already in TIME_WAIT.
	 *
	 * Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if(sk->debug)
			printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			sk->shutdown = SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_CLOSE);
			return 1;
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 * Move to FIN_WAIT2 to await a FIN from the other end. Set
	 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 * Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake
	 */

	if(sk->state==TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest=th->source;
		sk->copied_seq = sk->acked_seq;
		if(!sk->dead)
			sk->state_change(sk);
		if(sk->max_window==0)
		{
			sk->max_window=32;	/* Sanity check */
			sk->mss=min(sk->max_window,sk->mtu);
		}
	}

	/*
	 * I make no guarantees about the first clause in the following
	 * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
	 * what conditions "!flag" would be true.  However I think the rest
	 * of the conditions would prevent that from causing any
	 * unnecessary retransmission.
	 *   Clearly if the first packet has expired it should be
	 * retransmitted.  The other alternative, "flag&2 && retransmits", is
	 * harder to explain:  You have to look carefully at how and when the
	 * timer is set and with what timeout.  The most recent transmission always
	 * sets the timer.  So in general if the most recent thing has timed
	 * out, everything before it has as well.  So we want to go ahead and
	 * retransmit some more.  If we didn't explicitly test for this
	 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 * would not be true.  If you look at the pattern of timing, you can
	 * show that rto is increased fast enough that the next packet would
	 * almost never be retransmitted immediately.  Then you'd end up
	 * waiting for a timeout to send each packet on the retransmission
	 * queue.  With my implementation of the Karn sampling algorithm,
	 * the timeout would double each time.  The net result is that it would
	 * take a hideous amount of time to recover from a single dropped packet.
	 * It's possible that there should also be a test for TIME_WRITE, but
	 * I think as long as "send_head != NULL" and "retransmit" is on, we've
	 * got to be in real retransmission mode.
	 *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
	 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 * As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	       (((flag&2) && sk->retransmits) ||
	       (sk->send_head->when + sk->rto < jiffies)))
	{
		if(sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return 1;

uninteresting_ack:
	if(sk->debug)
		printk("Ack ignored %u %u\n",ack,sk->sent_seq);

	/*
	 *	Keepalive processing.
	 */

	if (after(ack, sk->sent_seq))
	{
		return 0;
	}

	/*
	 *	Restart the keepalive timer.
	 */

	if (sk->keepopen)
	{
		if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
			tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
	}
	return 1;
}
1085
1086 /*1087 * Process the FIN bit. This now behaves as it is supposed to work1088 * and the FIN takes effect when it is validly part of sequence1089 * space. Not before when we get holes.1090 *1091 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT1092 * (and thence onto LAST-ACK and finally, CLOSE, we never enter1093 * TIME-WAIT)1094 *1095 * If we are in FINWAIT-1, a received FIN indicates simultaneous1096 * close and we go into CLOSING (and later onto TIME-WAIT)1097 *1098 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.1099 *1100 */1101
static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	/* Remember where the peer's data stream ends. */
	sk->fin_seq = skb->end_seq;

	/* Wake anyone sleeping on this socket - the read side is over. */
	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * move to CLOSE_WAIT, tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * received a retransmission of the FIN,
			 * restart the TIME_WAIT timer.
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens, we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */

			if(sk->ip_xmit_timeout != TIME_WRITE)
				tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * received a FIN -- send ACK and enter TIME_WAIT
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * already in CLOSE
			 */
			break;
		default:
			/* Any other state: we sent our FIN first, so we now
			   just wait for the ack of it. */
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}
1181
1182
/*
 *	This routine handles the data. If there is room in the buffer, it
 *	will have already been moved into it. If there is no room, then
 *	we will just have to discard the packet.
 */
1189 staticinttcp_data(structsk_buff *skb, structsock *sk,
/* */1190 unsignedlongsaddr, unsignedshortlen)
1191 {1192 structsk_buff *skb1, *skb2;
1193 structtcphdr *th;
1194 intdup_dumped=0;
1195 u32new_seq, shut_seq;
1196
1197 th = skb->h.th;
1198 skb_pull(skb,th->doff*4);
1199 skb_trim(skb,len-(th->doff*4));
1200
1201 /*1202 * The bytes in the receive read/assembly queue has increased. Needed for the1203 * low memory discard algorithm 1204 */1205
1206 sk->bytes_rcv += skb->len;
1207
1208 if (skb->len == 0 && !th->fin)
1209 {1210 /* 1211 * Don't want to keep passing ack's back and forth. 1212 * (someone sent us dataless, boring frame)1213 */1214 if (!th->ack)
1215 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
1216 kfree_skb(skb, FREE_READ);
1217 return(0);
1218 }1219
1220 /*1221 * We no longer have anyone receiving data on this connection.1222 */1223
1224 #ifndef TCP_DONT_RST_SHUTDOWN
1225
1226 if(sk->shutdown & RCV_SHUTDOWN)
1227 {1228 /*1229 * FIXME: BSD has some magic to avoid sending resets to1230 * broken 4.2 BSD keepalives. Much to my surprise a few non1231 * BSD stacks still have broken keepalives so we want to1232 * cope with it.1233 */1234
1235 if(skb->len) /* We don't care if it's just an ack or1236 a keepalive/window probe */1237 {1238 new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */1239
1240 /* Do this the way 4.4BSD treats it. Not what I'd1241 regard as the meaning of the spec but it's what BSD1242 does and clearly they know everything 8) */1243
1244 /*1245 * This is valid because of two things1246 *1247 * a) The way tcp_data behaves at the bottom.1248 * b) A fin takes effect when read not when received.1249 */1250
1251 shut_seq = sk->acked_seq+1; /* Last byte */1252
1253 if(after(new_seq,shut_seq))
1254 {1255 if(sk->debug)
1256 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
1257 sk, new_seq, shut_seq, sk->blog);
1258 if(sk->dead)
1259 {1260 sk->acked_seq = new_seq + th->fin;
1261 tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
1262 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
1263 tcp_statistics.TcpEstabResets++;
1264 sk->err = EPIPE;
1265 sk->error_report(sk);
1266 sk->shutdown = SHUTDOWN_MASK;
1267 tcp_set_state(sk,TCP_CLOSE);
1268 kfree_skb(skb, FREE_READ);
1269 return 0;
1270 }1271 }1272 }1273 }1274
1275 #endif1276
1277 /*1278 * Now we have to walk the chain, and figure out where this one1279 * goes into it. This is set up so that the last packet we received1280 * will be the first one we look at, that way if everything comes1281 * in order, there will be no performance loss, and if they come1282 * out of order we will be able to fit things in nicely.1283 *1284 * [AC: This is wrong. We should assume in order first and then walk1285 * forwards from the first hole based upon real traffic patterns.]1286 * 1287 */1288
1289 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */1290 {1291 skb_queue_head(&sk->receive_queue,skb);
1292 skb1= NULL;
1293 }1294 else1295 {1296 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
1297 {1298 if(sk->debug)
1299 {1300 printk("skb1=%p :", skb1);
1301 printk("skb1->seq = %d: ", skb1->seq);
1302 printk("skb->seq = %d\n",skb->seq);
1303 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
1304 sk->acked_seq);
1305 }1306
1307 /*1308 * Optimisation: Duplicate frame or extension of previous frame from1309 * same sequence point (lost ack case).1310 * The frame contains duplicate data or replaces a previous frame1311 * discard the previous frame (safe as sk->users is set) and put1312 * the new one in its place.1313 */1314
1315 if (skb->seq==skb1->seq && skb->len>=skb1->len)
1316 {1317 skb_append(skb1,skb);
1318 skb_unlink(skb1);
1319 kfree_skb(skb1,FREE_READ);
1320 dup_dumped=1;
1321 skb1=NULL;
1322 break;
1323 }1324
1325 /*1326 * Found where it fits1327 */1328
1329 if (after(skb->seq+1, skb1->seq))
1330 {1331 skb_append(skb1,skb);
1332 break;
1333 }1334
1335 /*1336 * See if we've hit the start. If so insert.1337 */1338 if (skb1 == skb_peek(&sk->receive_queue))
1339 {1340 skb_queue_head(&sk->receive_queue, skb);
1341 break;
1342 }1343 }1344 }1345
1346 /*1347 * Figure out what the ack value for this frame is1348 */1349
1350 if (before(sk->acked_seq, sk->copied_seq))
1351 {1352 printk("*** tcp.c:tcp_data bug acked < copied\n");
1353 sk->acked_seq = sk->copied_seq;
1354 }1355
1356 /*1357 * Now figure out if we can ack anything. This is very messy because we really want two1358 * receive queues, a completed and an assembly queue. We also want only one transmit1359 * queue.1360 */1361
1362 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1))
1363 {1364 if (before(skb->seq, sk->acked_seq+1))
1365 {1366
1367 if (after(skb->end_seq, sk->acked_seq))
1368 sk->acked_seq = skb->end_seq;
1369
1370 skb->acked = 1;
1371
1372 /*1373 * When we ack the fin, we do the FIN 1374 * processing.1375 */1376
1377 if (skb->h.th->fin)
1378 {1379 tcp_fin(skb,sk,skb->h.th);
1380 }1381
1382 for(skb2 = skb->next;
1383 skb2 != (structsk_buff *)&sk->receive_queue;
1384 skb2 = skb2->next)
1385 {1386 if (before(skb2->seq, sk->acked_seq+1))
1387 {1388 if (after(skb2->end_seq, sk->acked_seq))
1389 sk->acked_seq = skb2->end_seq;
1390
1391 skb2->acked = 1;
1392 /*1393 * When we ack the fin, we do1394 * the fin handling.1395 */1396 if (skb2->h.th->fin)
1397 {1398 tcp_fin(skb,sk,skb->h.th);
1399 }1400
1401 /*1402 * Force an immediate ack.1403 */1404
1405 sk->ack_backlog = sk->max_ack_backlog;
1406 }1407 else1408 {1409 break;
1410 }1411 }1412
1413 /*1414 * This also takes care of updating the window.1415 * This if statement needs to be simplified.1416 *1417 * rules for delaying an ack:1418 * - delay time <= 0.5 HZ1419 * - we don't have a window update to send1420 * - must send at least every 2 full sized packets1421 */1422 if (!sk->delay_acks ||
1423 sk->ack_backlog >= sk->max_ack_backlog ||
1424 sk->bytes_rcv > sk->max_unacked || th->fin ||
1425 sk->ato > HZ/2 ||
1426 tcp_raise_window(sk)) {1427 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */1428 }1429 else1430 {1431 sk->ack_backlog++;
1432
1433 if(sk->debug)
1434 printk("Ack queued.\n");
1435 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->ato);
1436 }1437 }1438 }1439
1440 /*1441 * If we've missed a packet, send an ack.1442 * Also start a timer to send another.1443 */1444
1445 if (!skb->acked)
1446 {1447
1448 /*1449 * This is important. If we don't have much room left,1450 * we need to throw out a few packets so we have a good1451 * window. Note that mtu is used, not mss, because mss is really1452 * for the send side. He could be sending us stuff as large as mtu.1453 */1454
1455 while (sock_rspace(sk) < sk->mtu)
1456 {1457 skb1 = skb_peek(&sk->receive_queue);
1458 if (skb1 == NULL)
1459 {1460 printk("INET: tcp.c:tcp_data memory leak detected.\n");
1461 break;
1462 }1463
1464 /*1465 * Don't throw out something that has been acked. 1466 */1467
1468 if (skb1->acked)
1469 {1470 break;
1471 }1472
1473 skb_unlink(skb1);
1474 kfree_skb(skb1, FREE_READ);
1475 }1476 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
1477 sk->ack_backlog++;
1478 tcp_reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, 0.5 * HZ));
1479 }1480 else1481 {1482 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
1483 }1484
1485 /*1486 * Now tell the user we may have some data. 1487 */1488
1489 if (!sk->dead)
1490 {1491 if(sk->debug)
1492 printk("Data wakeup.\n");
1493 sk->data_ready(sk,0);
1494 }1495 return(0);
1496 }1497
1498
/*
 *	This routine is only called when we have urgent data
 *	signalled.  It's the 'slow' part of tcp_urg.  It could be
 *	moved inline now as tcp_urg is only called from one
 *	place.  We handle URGent data wrong.  We have to - as
 *	BSD still doesn't use the correction from RFC961.
 */

1507 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */1508 {1509 u32ptr = ntohs(th->urg_ptr);
1510
1511 if (ptr)
1512 ptr--;
1513 ptr += ntohl(th->seq);
1514
1515 /* ignore urgent data that we've already seen and read */1516 if (after(sk->copied_seq, ptr))
1517 return;
1518
1519 /* do we already have a newer (or duplicate) urgent pointer? */1520 if (sk->urg_data && !after(ptr, sk->urg_seq))
1521 return;
1522
1523 /* tell the world about our new urgent pointer */1524 if (sk->proc != 0) {1525 if (sk->proc > 0) {1526 kill_proc(sk->proc, SIGURG, 1);
1527 }else{1528 kill_pg(-sk->proc, SIGURG, 1);
1529 }1530 }1531 sk->urg_data = URG_NOTYET;
1532 sk->urg_seq = ptr;
1533 }1534
1535 /*1536 * This is the 'fast' part of urgent handling.1537 */1538
1539 staticinlinevoidtcp_urg(structsock *sk, structtcphdr *th, unsignedlonglen)
/* */1540 {1541 /*1542 * Check if we get a new urgent pointer - normally not 1543 */1544
1545 if (th->urg)
1546 tcp_check_urg(sk,th);
1547
1548 /*1549 * Do we wait for any urgent data? - normally not1550 */1551
1552 if (sk->urg_data == URG_NOTYET) {1553 u32ptr;
1554
1555 /*1556 * Is the urgent pointer pointing into this packet? 1557 */1558 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
1559 if (ptr < len) {1560 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
1561 if (!sk->dead)
1562 sk->data_ready(sk,0);
1563 }1564 }1565 }1566
1567
1568 /*1569 * A TCP packet has arrived.1570 * skb->h.raw is the TCP header.1571 */1572
int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	/*
	 *	Main TCP receive entry point: validate the segment, find the
	 *	owning socket, run the connection state machine, then hand the
	 *	payload to tcp_urg()/tcp_data().  Returns 0 in all cases.
	 */
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	/*
	 *	"redo" is 1 if we have already seen this skb but couldn't
	 *	use it at that time (the socket was locked). In that case
	 *	we have already done a lot of the work (looked up the socket
	 *	etc).
	 */
	th = skb->h.th;
	sk = skb->sk;
	if (!redo) {
		tcp_statistics.TcpInSegs++;
		/* Not addressed to this host (promiscuous capture etc.) */
		if (skb->pkt_type!=PACKET_HOST)
			goto discard_it;

		/*
		 *	Pull up the IP header.
		 */

		skb_pull(skb, skb->h.raw-skb->data);

		/*
		 *	Try to use the device checksum if provided.
		 *	Note the deliberate fallthroughs: NONE computes the sum
		 *	then falls into HW which verifies it; UNNECESSARY skips
		 *	the check entirely.
		 */
		switch (skb->ip_summed)
		{
			case CHECKSUM_NONE:
				skb->csum = csum_partial((char *)th, len, 0);
				/* fall through */
			case CHECKSUM_HW:
				if (tcp_check(th, len, saddr, daddr, skb->csum))
					goto discard_it;
				/* fall through */
			default:
				/* CHECKSUM_UNNECESSARY */
		}
		sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
		if (!sk)
			goto no_tcp_socket;
		skb->sk = sk;
		/* Host-order sequence bookkeeping; end_seq counts SYN/FIN too */
		skb->seq = ntohl(th->seq);
		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
		skb->ack_seq = ntohl(th->ack_seq);

		skb->acked = 0;
		skb->used = 0;
		skb->free = 1;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here. */
		if (sk->users)
		{
			skb_queue_tail(&sk->back_log, skb);
			return(0);
		}
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk->zapped || sk->state==TCP_CLOSE)
		goto no_tcp_socket;

	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 */

	skb->sk=sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	We should now do header prediction.
	 */

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 *
			 *	Now TTCP is starting to use we ought to queue this data.
			 */

			return 0;
		}

		/*
		 *	Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
		 *	then its a new connection
		 */

		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected. The SYN_SENT case is unusual and should
		 *	not be in line code. [AC]
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,skb->ack_seq,len))
				{
					/* Reset the ack - its an ack from a
					   different connection  [ th->rst is checked in tcp_send_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return(0);
				}
				if(th->rst)
					return tcp_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq = skb->seq+1;
				sk->lastwin_seq = skb->seq+1;
				sk->fin_seq = skb->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				/* Peer never advertised a window: assume a tiny one */
				if(sk->max_window==0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
					   sk->dummy_th.source==th->source &&
					   sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

		/*
		 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
		 *	a more complex suggestion for fixing these reuse issues in RFC1644
		 *	but not yet ready for general use. Also see RFC1379.
		 *
		 *	Note the funny way we go back to the top of this function for
		 *	this case ("goto try_next_socket"). That also takes care of
		 *	checking "sk->users" for the new socket as well as doing all
		 *	the normal tests on the packet.
		 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(skb->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			/* Uncharge the skb from the dying socket before handing
			   it to the listener below */
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			/* this is not really correct: we should check sk->users */
			if (sk && sk->state==TCP_LISTEN)
			{
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */

	if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
	{
		bad_tcp_sequence(sk, th, len, opt, saddr, dev);
		kfree_skb(skb, FREE_READ);
		return 0;
	}

	if(th->rst)
		return tcp_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_reset(sk,skb);
	}

	/* Feed the delayed-ack timeout estimator */
	tcp_delack_estimator(sk);

	/*
	 *	Process the ACK
	 */


	if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	If the accepted buffer put us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */

	if (sk->rmem_alloc >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		return(0);
	}


	/*
	 *	Process urgent data
	 */

	tcp_urg(sk, th, len);

	/*
	 *	Process the encapsulated data
	 */

	if(tcp_data(skb,sk, saddr, len))
		kfree_skb(skb, FREE_READ);

	/*
	 *	And done
	 */

	return 0;

no_tcp_socket:
	/*
	 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
	 */
	tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);

discard_it:
	/*
	 *	Discard frame
	 */
	skb->sk = NULL;
	kfree_skb(skb, FREE_READ);
	return 0;
}