1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp_input.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 */ 22
23 #include <linux/config.h>
24 #include <net/tcp.h>
25
26 /* 27 * Cached last hit socket 28 */ 29
/*
 * One-entry "last hit" socket cache.  th_cache_sk == NULL marks the
 * cache empty (tcp_cache_zap() clears only that field; get_tcp_sock()
 * never trusts the address/port fields unless th_cache_sk is set).
 * Declared volatile — presumably because the cache can be zapped
 * asynchronously to a lookup in progress; TODO(review) confirm the
 * intended concurrency model.
 */
static volatile unsigned long th_cache_saddr,th_cache_daddr;
static volatile unsigned short th_cache_dport, th_cache_sport;
static volatile struct sock *th_cache_sk;
33
/*
 * Invalidate the last-hit socket cache.  Clearing th_cache_sk alone is
 * sufficient: a NULL cached socket is treated as a miss, so the stale
 * address/port fields are never consulted on their own.
 */
void tcp_cache_zap(void)
{
	th_cache_sk=NULL;
}
39 /* 40 * Find the socket, using the last hit cache if applicable. 41 */ 42 staticinlinestructsock * get_tcp_sock(u32saddr, u16sport, u32daddr, u16dport)
/* */ 43 { 44 structsock * sk;
45
46 sk = (structsock *) th_cache_sk;
47 if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
48 sport != th_cache_sport || dport != th_cache_dport) { 49 sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
50 if (sk) { 51 th_cache_saddr=saddr;
52 th_cache_daddr=daddr;
53 th_cache_dport=dport;
54 th_cache_sport=sport;
55 th_cache_sk=sk;
56 } 57 } 58 returnsk;
59 } 60
61 /* 62 * React to a out-of-window TCP sequence number in an incoming packet 63 */ 64 staticvoidbad_tcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */ 65 structoptions *opt, unsignedlongsaddr, structdevice *dev)
66 { 67 if (th->rst)
68 return;
69
70 /* 71 * Send a reset if we get something not ours and we are 72 * unsynchronized. Note: We don't do anything to our end. We 73 * are just killing the bogus remote connection then we will 74 * connect again and it will work (with luck). 75 */ 76
77 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
78 { 79 tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
80 return;
81 } 82
83 /* Try to resync things. */ 84 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
85 return;
86 } 87
88 /* 89 * This functions checks to see if the tcp header is actually acceptable. 90 */ 91
92 extern__inline__inttcp_sequence(structsock *sk, u32seq, u32end_seq)
/* */ 93 { 94 u32end_window = sk->acked_seq + sk->window;
95 return/* if start is at end of window, end must be too (zero window) */ 96 (seq == end_window && seq == end_seq) ||
97 /* if start is before end of window, check for interest */ 98 (before(seq, end_window) && !before(end_seq, sk->acked_seq));
99 } 100
/*
 * Handle a received RST: record an error code appropriate to the
 * current state, tear the connection down to CLOSE, wake the owner,
 * and drop the segment.  Always returns 0.
 */

static int tcp_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;
	/* Default error; refined below for the two states where a reset
	 * conventionally means something more specific. */
	sk->err = ECONNRESET;
	if (sk->state == TCP_SYN_SENT)
		sk->err = ECONNREFUSED;	/* reset of our SYN = connection refused */
	if (sk->state == TCP_CLOSE_WAIT)
		sk->err = EPIPE;	/* peer already FIN'd; writes are broken */
#ifdef TCP_DO_RFC1337
	/*
	 * Time wait assassination protection [RFC1337]:
	 * ignore RSTs while in TIME_WAIT so an old duplicate cannot
	 * kill the TIME_WAIT state prematurely.
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);	/* wake anyone sleeping on this socket */
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
133
134 /* 135 * Look for tcp options. Parses everything but only knows about MSS. 136 * This routine is always called with the packet containing the SYN. 137 * However it may also be called with the ack to the SYN. So you 138 * can't assume this is always the SYN. It's always called after 139 * we have set up sk->mtu to our own MTU. 140 * 141 * We need at minimum to add PAWS support here. Possibly large windows 142 * as Linux gets deployed on 100Mb/sec networks. 143 */ 144
145 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */ 146 { 147 unsignedchar *ptr;
148 intlength=(th->doff*4)-sizeof(structtcphdr);
149 intmss_seen = 0;
150
151 ptr = (unsignedchar *)(th + 1);
152
153 while(length>0)
154 { 155 intopcode=*ptr++;
156 intopsize=*ptr++;
157 switch(opcode)
158 { 159 caseTCPOPT_EOL:
160 return;
161 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ 162 length--;
163 ptr--; /* the opsize=*ptr++ above was a mistake */ 164 continue;
165
166 default:
167 if(opsize<=2) /* Avoid silly options looping forever */ 168 return;
169 switch(opcode)
170 { 171 caseTCPOPT_MSS:
172 if(opsize==4 && th->syn)
173 { 174 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
175 mss_seen = 1;
176 } 177 break;
178 /* Add other options here as people feel the urge to implement stuff like large windows */ 179 } 180 ptr+=opsize-2;
181 length-=opsize;
182 } 183 } 184 if (th->syn)
185 { 186 if (! mss_seen)
187 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */ 188 } 189 #ifdefCONFIG_INET_PCTCP 190 sk->mss = min(sk->max_window >> 1, sk->mtu);
191 #else 192 sk->mss = min(sk->max_window, sk->mtu);
193 sk->max_unacked = 2 * sk->mss;
194 #endif 195 } 196
197
198 /* 199 * This routine handles a connection request. 200 * It should make sure we haven't already responded. 201 * Because of the way BSD works, we have to send a syn/ack now. 202 * This also means it will be harder to close a socket which is 203 * listening. 204 */ 205
206 staticvoidtcp_conn_request(structsock *sk, structsk_buff *skb,
/* */ 207 u32daddr, u32saddr, structoptions *opt, structdevice *dev, u32seq)
208 { 209 structsock *newsk;
210 structtcphdr *th;
211 structrtable *rt;
212
213 th = skb->h.th;
214
215 /* If the socket is dead, don't accept the connection. */ 216 if (!sk->dead)
217 { 218 sk->data_ready(sk,0);
219 } 220 else 221 { 222 if(sk->debug)
223 printk("Reset on %p: Connect on dead socket.\n",sk);
224 tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
225 tcp_statistics.TcpAttemptFails++;
226 kfree_skb(skb, FREE_READ);
227 return;
228 } 229
230 /* 231 * Make sure we can accept more. This will prevent a 232 * flurry of syns from eating up all our memory. 233 */ 234
235 if (sk->ack_backlog >= sk->max_ack_backlog)
236 { 237 tcp_statistics.TcpAttemptFails++;
238 kfree_skb(skb, FREE_READ);
239 return;
240 } 241
242 /* 243 * We need to build a new sock struct. 244 * It is sort of bad to have a socket without an inode attached 245 * to it, but the wake_up's will just wake up the listening socket, 246 * and if the listening socket is destroyed before this is taken 247 * off of the queue, this will take care of it. 248 */ 249
250 newsk = (structsock *) kmalloc(sizeof(structsock), GFP_ATOMIC);
251 if (newsk == NULL)
252 { 253 /* just ignore the syn. It will get retransmitted. */ 254 tcp_statistics.TcpAttemptFails++;
255 kfree_skb(skb, FREE_READ);
256 return;
257 } 258
259 memcpy(newsk, sk, sizeof(*newsk));
260 newsk->opt = NULL;
261 newsk->ip_route_cache = NULL;
262 if (opt && opt->optlen) { 263 sk->opt = (structoptions*)kmalloc(sizeof(structoptions)+opt->optlen, GFP_ATOMIC);
264 if (!sk->opt) { 265 kfree_s(newsk, sizeof(structsock));
266 tcp_statistics.TcpAttemptFails++;
267 kfree_skb(skb, FREE_READ);
268 return;
269 } 270 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) { 271 kfree_s(sk->opt, sizeof(structoptions)+opt->optlen);
272 kfree_s(newsk, sizeof(structsock));
273 tcp_statistics.TcpAttemptFails++;
274 kfree_skb(skb, FREE_READ);
275 return;
276 } 277 } 278 skb_queue_head_init(&newsk->write_queue);
279 skb_queue_head_init(&newsk->receive_queue);
280 newsk->send_head = NULL;
281 newsk->send_tail = NULL;
282 skb_queue_head_init(&newsk->back_log);
283 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/ 284 newsk->rto = TCP_TIMEOUT_INIT;
285 newsk->mdev = 0;
286 newsk->max_window = 0;
287 newsk->cong_window = 1;
288 newsk->cong_count = 0;
289 newsk->ssthresh = 0;
290 newsk->backoff = 0;
291 newsk->blog = 0;
292 newsk->intr = 0;
293 newsk->proc = 0;
294 newsk->done = 0;
295 newsk->partial = NULL;
296 newsk->pair = NULL;
297 newsk->wmem_alloc = 0;
298 newsk->rmem_alloc = 0;
299 newsk->localroute = sk->localroute;
300
301 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
302
303 newsk->err = 0;
304 newsk->shutdown = 0;
305 newsk->ack_backlog = 0;
306 newsk->acked_seq = skb->seq+1;
307 newsk->lastwin_seq = skb->seq+1;
308 newsk->delay_acks = 1;
309 newsk->copied_seq = skb->seq+1;
310 newsk->fin_seq = skb->seq;
311 newsk->state = TCP_SYN_RECV;
312 newsk->timeout = 0;
313 newsk->ip_xmit_timeout = 0;
314 newsk->write_seq = seq;
315 newsk->window_seq = newsk->write_seq;
316 newsk->rcv_ack_seq = newsk->write_seq;
317 newsk->urg_data = 0;
318 newsk->retransmits = 0;
319 newsk->linger=0;
320 newsk->destroy = 0;
321 init_timer(&newsk->timer);
322 newsk->timer.data = (unsignedlong)newsk;
323 newsk->timer.function = &net_timer;
324 init_timer(&newsk->retransmit_timer);
325 newsk->retransmit_timer.data = (unsignedlong)newsk;
326 newsk->retransmit_timer.function=&tcp_retransmit_timer;
327 newsk->dummy_th.source = skb->h.th->dest;
328 newsk->dummy_th.dest = skb->h.th->source;
329
330 /* 331 * Swap these two, they are from our point of view. 332 */ 333
334 newsk->daddr = saddr;
335 newsk->saddr = daddr;
336 newsk->rcv_saddr = daddr;
337
338 put_sock(newsk->num,newsk);
339 newsk->dummy_th.res1 = 0;
340 newsk->dummy_th.doff = 6;
341 newsk->dummy_th.fin = 0;
342 newsk->dummy_th.syn = 0;
343 newsk->dummy_th.rst = 0;
344 newsk->dummy_th.psh = 0;
345 newsk->dummy_th.ack = 0;
346 newsk->dummy_th.urg = 0;
347 newsk->dummy_th.res2 = 0;
348 newsk->acked_seq = skb->seq + 1;
349 newsk->copied_seq = skb->seq + 1;
350 newsk->socket = NULL;
351
352 /* 353 * Grab the ttl and tos values and use them 354 */ 355
356 newsk->ip_ttl=sk->ip_ttl;
357 newsk->ip_tos=skb->ip_hdr->tos;
358
359 /* 360 * Use 512 or whatever user asked for 361 */ 362
363 /* 364 * Note use of sk->user_mss, since user has no direct access to newsk 365 */ 366
367 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
368 newsk->ip_route_cache = rt;
369
370 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
371 newsk->window_clamp = rt->rt_window;
372 else 373 newsk->window_clamp = 0;
374
375 if (sk->user_mss)
376 newsk->mtu = sk->user_mss;
377 elseif (rt)
378 newsk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
379 else 380 newsk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
381
382 /* 383 * But not bigger than device MTU 384 */ 385
386 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
387
388 #ifdefCONFIG_SKIP 389
390 /* 391 * SKIP devices set their MTU to 65535. This is so they can take packets 392 * unfragmented to security process then fragment. They could lie to the 393 * TCP layer about a suitable MTU, but its easier to let skip sort it out 394 * simply because the final package we want unfragmented is going to be 395 * 396 * [IPHDR][IPSP][Security data][Modified TCP data][Security data] 397 */ 398
399 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */ 400 sk->mtu=skip_pick_mtu(sk->mtu,dev);
401 #endif 402 /* 403 * This will min with what arrived in the packet 404 */ 405
406 tcp_options(newsk,skb->h.th);
407
408 tcp_cache_zap();
409 tcp_send_synack(newsk, sk, skb);
410 } 411
/*
 * This routine deals with incoming acks, but not outgoing ones.
 *
 * Processing covers: window tracking and shrink recovery, Jacobson
 * slow start / congestion avoidance, RTT estimation and Karn's
 * algorithm, retransmit-queue trimming, xmit/probe/keepalive timer
 * management, and the ack-driven state transitions out of LAST_ACK,
 * FIN_WAIT1, CLOSING and SYN_RECV.
 *
 * Returns 0 for an ack beyond what we have sent (ignored), 1 otherwise.
 */

static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
{
	int flag = 0;
	unsigned window;

	/*
	 * flag bits:
	 * 1 - there was data in packet as well as ack or new data is sent or
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if(sk->zapped)
		return(1);	/* Dead, cant ack any more so why bother */

	/*
	 * Have we discovered a larger window
	 */

	window = ntohs(th->window);

	if (window > sk->max_window)
	{
		sk->max_window = window;
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(window>>1, sk->mtu);
#else
		sk->mss = min(window, sk->mtu);
#endif
	}

	/*
	 * We have dropped back to keepalive timeouts. Thus we have
	 * no retransmits pending.
	 */

	if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 * If the ack is newer than sent or older than previous acks
	 * then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
	{
		if(sk->debug)
			printk("Ack ignored %u %u\n",ack,sk->sent_seq);

		/*
		 * Keepalive processing.
		 */

		if (after(ack, sk->sent_seq))
		{
			return(0);
		}

		/*
		 * Restart the keepalive timer.
		 */

		if (sk->keepopen)
		{
			if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
				tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
		}
		return(1);
	}

	/*
	 * If there is data set flag 1
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 * See if our window has been shrunk.
	 */

	if (after(sk->window_seq, ack+window))
	{
		/*
		 * We may need to move packets from the send queue
		 * to the write queue, if the window has been shrunk on us.
		 * The RFC says you are not allowed to shrink your window
		 * like this, but if the other end does, you must be able
		 * to deal with it.
		 */
		struct sk_buff *skb;
		struct sk_buff *skb2;
		struct sk_buff *wskb = NULL;

		skb2 = sk->send_head;
		sk->send_head = NULL;
		sk->send_tail = NULL;

		/*
		 * This is an artifact of a flawed concept. We want one
		 * queue and a smarter send routine when we send all.
		 */

		flag |= 4;	/* Window changed */

		sk->window_seq = ack + window;
		cli();
		/* Walk the old retransmit list: segments now outside the
		 * window go back onto write_queue (in order), the rest are
		 * relinked as the new send_head/send_tail chain. */
		while (skb2 != NULL)
		{
			skb = skb2;
			skb2 = skb->link3;
			skb->link3 = NULL;
			if (after(skb->end_seq, sk->window_seq))
			{
				if (sk->packets_out > 0)
					sk->packets_out--;
				/* We may need to remove this from the dev send list. */
				if (skb->next != NULL)
				{
					skb_unlink(skb);
				}
				/* Now add it to the write_queue. */
				if (wskb == NULL)
					skb_queue_head(&sk->write_queue,skb);
				else
					skb_append(wskb,skb);
				wskb = skb;
			}
			else
			{
				if (sk->send_head == NULL)
				{
					sk->send_head = skb;
					sk->send_tail = skb;
				}
				else
				{
					sk->send_tail->link3 = skb;
					sk->send_tail = skb;
				}
				skb->link3 = NULL;
			}
		}
		sti();
	}

	/*
	 * Pipe has emptied
	 */

	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out= 0;
	}

	/*
	 * Update the right hand window edge of the host
	 */

	sk->window_seq = ack + window;

	/*
	 * We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a
		 * counter and increment it once every cwnd times.  It's possible
		 * that this should be done only if sk->retransmits == 0.  I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 * In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 * In dangerous area, increase slowly.  In theory this is
			 * sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

	/*
	 * Remember the highest ack received.
	 */

	sk->rcv_ack_seq = ack;

	/*
	 * We passed data and got it acked, remove any soft error
	 * log. Something worked...
	 */

	sk->err_soft = 0;

	/*
	 * If this ack opens up a zero window, clear backoff.  It was
	 * being used to time the probes, and is probably far higher than
	 * it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 * Was it a usable window open ?
		 */

		if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
		    ! before (sk->window_seq, sk->write_queue.next->end_seq))
		{
			sk->backoff = 0;

			/*
			 * Recompute rto from rtt.  this eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < HZ/5)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = HZ/5;
		}
	}

	/*
	 * See if we can take anything off of the retransmit queue.
	 */

	while(sk->send_head != NULL)
	{
		/* Check for a bug. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->end_seq, sk->send_head->link3->end_seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 * If our packet is before the ack sequence we can
		 * discard it as it's confirmed to have arrived the other end.
		 */

		if (before(sk->send_head->end_seq, ack+1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 * We were retransmitting.  don't count this in RTT est
				 */
				flag |= 2;

				/*
				 * even though we've gotten an ack, we're still
				 * retransmitting as long as we're sending from
				 * the retransmit queue. Keeping retransmits non-zero
				 * prevents us from getting new data interspersed with
				 * retransmissions.
				 */

				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 * Note that we only reset backoff and rto in the
			 * rtt recomputation code.  And that doesn't happen
			 * if there were retransmissions in effect.  So the
			 * first new packet after the retransmissions is
			 * sent with the backoff still in effect.  Not until
			 * we get an ack from a non-retransmitted packet do
			 * we reset the backoff and rto.  This allows us to deal
			 * with a situation where the network delay has increased
			 * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 * We have one less packet out there.
			 */

			if (sk->packets_out > 0)
				sk->packets_out --;

			oskb = sk->send_head;

			if (!(flag&2)) 	/* Not retransmitting */
			{
				long m;

				/*
				 * The following amusing code comes from Jacobson's
				 * article in SIGCOMM '88.  Note that rtt and mdev
				 * are scaled versions of rtt and mean deviation.
				 * This is designed to be as fast as possible
				 * m stands for "measurement".
				 */

				m = jiffies - oskb->when;  /* RTT */
				if(m<=0)
					m=1;		/* IS THIS RIGHT FOR <0 ??? */
				m -= (sk->rtt >> 3);    /* m is now error in rtt est */
				sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
				if (m < 0)
					m = -m;		/* m is now abs(error) */
				m -= (sk->mdev >> 2);   /* similar update on mdev */
				sk->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */

				/*
				 * Now update timeout.  Note that this removes any backoff.
				 */

				sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
				if (sk->rto > 120*HZ)
					sk->rto = 120*HZ;
				if (sk->rto < HZ/5)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
					sk->rto = HZ/5;
				sk->backoff = 0;
			}
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
					   In this case as we just set it up */
			cli();
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 * We may need to remove this from the dev send list.
			 */

			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 * XXX someone ought to look at this too.. at the moment, if skb_peek()
	 * returns non-NULL, we complete ignore the timer stuff in the else
	 * clause.  We ought to organize the code so that else clause can
	 * (should) be executed regardless, possibly moving the PROBE timer
	 * reset over.  The skb_peek() thing should only move stuff to the
	 * write queue, NOT also manage the timer functions.
	 */

	/*
	 * Maybe we can take some stuff off of the write queue,
	 * and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 * Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/*
			 * Data to queue but no room.
			 */
			tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 * from TCP_CLOSE we don't do anything
		 *
		 * from anything else, if there is write data (or fin) pending,
		 * we use a TIME_WRITE timeout, else if keepalive we reset to
		 * a KEEPALIVE timeout, else we delete the timer.
		 *
		 * We do not set flag for nominal write data, otherwise we may
		 * force a state where we start to write itsy bitsy tidbits
		 * of data.
		 */

		switch(sk->state) {
		case TCP_TIME_WAIT:
			/*
			 * keep us in TIME_WAIT until we stop getting packets,
			 * reset the timeout.
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 * don't touch the timer.
			 */
			break;
		default:
			/*
			 * Must check send_head, write_queue, and ack_backlog
			 * to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 * We have nothing queued but space to send. Send any partial
	 * packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
	 * we are now waiting for an acknowledge to our FIN.  The other end is
	 * already in TIME_WAIT.
	 *
	 * Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if(sk->debug)
			printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/ )
		{
			flag |= 1;
			sk->shutdown = SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_CLOSE);
			return 1;
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 * Move to FIN_WAIT2 to await a FIN from the other end. Set
	 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 * Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 * Final ack of a three way shake
	 */

	if(sk->state==TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest=th->source;
		sk->copied_seq = sk->acked_seq;
		if(!sk->dead)
			sk->state_change(sk);
		if(sk->max_window==0)
		{
			sk->max_window=32;	/* Sanity check */
			sk->mss=min(sk->max_window,sk->mtu);
		}
	}

	/*
	 * I make no guarantees about the first clause in the following
	 * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
	 * what conditions "!flag" would be true.  However I think the rest
	 * of the conditions would prevent that from causing any
	 * unnecessary retransmission.
	 *   Clearly if the first packet has expired it should be
	 * retransmitted.  The other alternative, "flag&2 && retransmits", is
	 * harder to explain:  You have to look carefully at how and when the
	 * timer is set and with what timeout.  The most recent transmission always
	 * sets the timer.  So in general if the most recent thing has timed
	 * out, everything before it has as well.  So we want to go ahead and
	 * retransmit some more.  If we didn't explicitly test for this
	 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 * would not be true.  If you look at the pattern of timing, you can
	 * show that rto is increased fast enough that the next packet would
	 * almost never be retransmitted immediately.  Then you'd end up
	 * waiting for a timeout to send each packet on the retransmission
	 * queue.  With my implementation of the Karn sampling algorithm,
	 * the timeout would double each time.  The net result is that it would
	 * take a hideous amount of time to recover from a single dropped packet.
	 * It's possible that there should also be a test for TIME_WRITE, but
	 * I think as long as "send_head != NULL" and "retransmit" is on, we've
	 * got to be in real retransmission mode.
	 *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
	 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 * As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	    (((flag&2) && sk->retransmits) ||
	     (sk->send_head->when + sk->rto < jiffies)))
	{
		if(sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return(1);
}
1006
1007 /*1008 * Process the FIN bit. This now behaves as it is supposed to work1009 * and the FIN takes effect when it is validly part of sequence1010 * space. Not before when we get holes.1011 *1012 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT1013 * (and thence onto LAST-ACK and finally, CLOSE, we never enter1014 * TIME-WAIT)1015 *1016 * If we are in FINWAIT-1, a received FIN indicates simultaneous1017 * close and we go into CLOSING (and later onto TIME-WAIT)1018 *1019 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.1020 *1021 */1022
1023 staticinttcp_fin(structsk_buff *skb, structsock *sk, structtcphdr *th)
/* */1024 {1025 sk->fin_seq = skb->end_seq;
1026
1027 if (!sk->dead)
1028 {1029 sk->state_change(sk);
1030 sock_wake_async(sk->socket, 1);
1031 }1032
1033 switch(sk->state)
1034 {1035 caseTCP_SYN_RECV:
1036 caseTCP_SYN_SENT:
1037 caseTCP_ESTABLISHED:
1038 /*1039 * move to CLOSE_WAIT, tcp_data() already handled1040 * sending the ack.1041 */1042 tcp_set_state(sk,TCP_CLOSE_WAIT);
1043 if (th->rst)
1044 sk->shutdown = SHUTDOWN_MASK;
1045 break;
1046
1047 caseTCP_CLOSE_WAIT:
1048 caseTCP_CLOSING:
1049 /*1050 * received a retransmission of the FIN, do1051 * nothing.1052 */1053 break;
1054 caseTCP_TIME_WAIT:
1055 /*1056 * received a retransmission of the FIN,1057 * restart the TIME_WAIT timer.1058 */1059 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1060 return(0);
1061 caseTCP_FIN_WAIT1:
1062 /*1063 * This case occurs when a simultaneous close1064 * happens, we must ack the received FIN and1065 * enter the CLOSING state.1066 *1067 * This causes a WRITE timeout, which will either1068 * move on to TIME_WAIT when we timeout, or resend1069 * the FIN properly (maybe we get rid of that annoying1070 * FIN lost hang). The TIME_WRITE code is already correct1071 * for handling this timeout.1072 */1073
1074 if(sk->ip_xmit_timeout != TIME_WRITE)
1075 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1076 tcp_set_state(sk,TCP_CLOSING);
1077 break;
1078 caseTCP_FIN_WAIT2:
1079 /*1080 * received a FIN -- send ACK and enter TIME_WAIT1081 */1082 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1083 sk->shutdown|=SHUTDOWN_MASK;
1084 tcp_set_state(sk,TCP_TIME_WAIT);
1085 break;
1086 caseTCP_CLOSE:
1087 /*1088 * already in CLOSE1089 */1090 break;
1091 default:
1092 tcp_set_state(sk,TCP_LAST_ACK);
1093
1094 /* Start the timers. */1095 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1096 return(0);
1097 }1098
1099 return(0);
1100 }1101
1102
1103
1104 /*1105 * This routine handles the data. If there is room in the buffer,1106 * it will be have already been moved into it. If there is no1107 * room, then we will just have to discard the packet.1108 */1109
1110 staticinttcp_data(structsk_buff *skb, structsock *sk,
/* */1111 unsignedlongsaddr, unsignedshortlen)
1112 {1113 structsk_buff *skb1, *skb2;
1114 structtcphdr *th;
1115 intdup_dumped=0;
1116 u32new_seq, shut_seq;
1117
1118 th = skb->h.th;
1119 skb_pull(skb,th->doff*4);
1120 skb_trim(skb,len-(th->doff*4));
1121
1122 /*1123 * The bytes in the receive read/assembly queue has increased. Needed for the1124 * low memory discard algorithm 1125 */1126
1127 sk->bytes_rcv += skb->len;
1128
1129 if (skb->len == 0 && !th->fin)
1130 {1131 /* 1132 * Don't want to keep passing ack's back and forth. 1133 * (someone sent us dataless, boring frame)1134 */1135 if (!th->ack)
1136 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
1137 kfree_skb(skb, FREE_READ);
1138 return(0);
1139 }1140
1141 /*1142 * We no longer have anyone receiving data on this connection.1143 */1144
1145 #ifndef TCP_DONT_RST_SHUTDOWN
1146
1147 if(sk->shutdown & RCV_SHUTDOWN)
1148 {1149 /*1150 * FIXME: BSD has some magic to avoid sending resets to1151 * broken 4.2 BSD keepalives. Much to my surprise a few non1152 * BSD stacks still have broken keepalives so we want to1153 * cope with it.1154 */1155
1156 if(skb->len) /* We don't care if it's just an ack or1157 a keepalive/window probe */1158 {1159 new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */1160
1161 /* Do this the way 4.4BSD treats it. Not what I'd1162 regard as the meaning of the spec but it's what BSD1163 does and clearly they know everything 8) */1164
1165 /*1166 * This is valid because of two things1167 *1168 * a) The way tcp_data behaves at the bottom.1169 * b) A fin takes effect when read not when received.1170 */1171
1172 shut_seq = sk->acked_seq+1; /* Last byte */1173
1174 if(after(new_seq,shut_seq))
1175 {1176 if(sk->debug)
1177 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
1178 sk, new_seq, shut_seq, sk->blog);
1179 if(sk->dead)
1180 {1181 sk->acked_seq = new_seq + th->fin;
1182 tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
1183 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
1184 tcp_statistics.TcpEstabResets++;
1185 sk->err = EPIPE;
1186 sk->error_report(sk);
1187 sk->shutdown = SHUTDOWN_MASK;
1188 tcp_set_state(sk,TCP_CLOSE);
1189 kfree_skb(skb, FREE_READ);
1190 return 0;
1191 }1192 }1193 }1194 }1195
1196 #endif1197
1198 /*1199 * Now we have to walk the chain, and figure out where this one1200 * goes into it. This is set up so that the last packet we received1201 * will be the first one we look at, that way if everything comes1202 * in order, there will be no performance loss, and if they come1203 * out of order we will be able to fit things in nicely.1204 *1205 * [AC: This is wrong. We should assume in order first and then walk1206 * forwards from the first hole based upon real traffic patterns.]1207 * 1208 */1209
1210 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */1211 {1212 skb_queue_head(&sk->receive_queue,skb);
1213 skb1= NULL;
1214 }1215 else1216 {1217 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
1218 {1219 if(sk->debug)
1220 {1221 printk("skb1=%p :", skb1);
1222 printk("skb1->seq = %d: ", skb1->seq);
1223 printk("skb->seq = %d\n",skb->seq);
1224 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
1225 sk->acked_seq);
1226 }1227
1228 /*1229 * Optimisation: Duplicate frame or extension of previous frame from1230 * same sequence point (lost ack case).1231 * The frame contains duplicate data or replaces a previous frame1232 * discard the previous frame (safe as sk->inuse is set) and put1233 * the new one in its place.1234 */1235
1236 if (skb->seq==skb1->seq && skb->len>=skb1->len)
1237 {1238 skb_append(skb1,skb);
1239 skb_unlink(skb1);
1240 kfree_skb(skb1,FREE_READ);
1241 dup_dumped=1;
1242 skb1=NULL;
1243 break;
1244 }1245
1246 /*1247 * Found where it fits1248 */1249
1250 if (after(skb->seq+1, skb1->seq))
1251 {1252 skb_append(skb1,skb);
1253 break;
1254 }1255
1256 /*1257 * See if we've hit the start. If so insert.1258 */1259 if (skb1 == skb_peek(&sk->receive_queue))
1260 {1261 skb_queue_head(&sk->receive_queue, skb);
1262 break;
1263 }1264 }1265 }1266
1267 /*1268 * Figure out what the ack value for this frame is1269 */1270
1271 if (before(sk->acked_seq, sk->copied_seq))
1272 {1273 printk("*** tcp.c:tcp_data bug acked < copied\n");
1274 sk->acked_seq = sk->copied_seq;
1275 }1276
1277 /*1278 * Now figure out if we can ack anything. This is very messy because we really want two1279 * receive queues, a completed and an assembly queue. We also want only one transmit1280 * queue.1281 */1282
1283 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1))
1284 {1285 if (before(skb->seq, sk->acked_seq+1))
1286 {1287
1288 if (after(skb->end_seq, sk->acked_seq))
1289 sk->acked_seq = skb->end_seq;
1290
1291 skb->acked = 1;
1292
1293 /*1294 * When we ack the fin, we do the FIN 1295 * processing.1296 */1297
1298 if (skb->h.th->fin)
1299 {1300 tcp_fin(skb,sk,skb->h.th);
1301 }1302
1303 for(skb2 = skb->next;
1304 skb2 != (structsk_buff *)&sk->receive_queue;
1305 skb2 = skb2->next)
1306 {1307 if (before(skb2->seq, sk->acked_seq+1))
1308 {1309 if (after(skb2->end_seq, sk->acked_seq))
1310 sk->acked_seq = skb2->end_seq;
1311
1312 skb2->acked = 1;
1313 /*1314 * When we ack the fin, we do1315 * the fin handling.1316 */1317 if (skb2->h.th->fin)
1318 {1319 tcp_fin(skb,sk,skb->h.th);
1320 }1321
1322 /*1323 * Force an immediate ack.1324 */1325
1326 sk->ack_backlog = sk->max_ack_backlog;
1327 }1328 else1329 {1330 break;
1331 }1332 }1333
1334 /*1335 * This also takes care of updating the window.1336 * This if statement needs to be simplified.1337 *1338 * rules for delaying an ack:1339 * - delay time <= 0.5 HZ1340 * - we don't have a window update to send1341 * - must send at least every 2 full sized packets1342 */1343 if (!sk->delay_acks ||
1344 sk->ack_backlog >= sk->max_ack_backlog ||
1345 sk->bytes_rcv > sk->max_unacked || th->fin ||
1346 sk->ato > HZ/2 ||
1347 tcp_raise_window(sk)) {1348 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */1349 }1350 else1351 {1352 sk->ack_backlog++;
1353
1354 if(sk->debug)
1355 printk("Ack queued.\n");
1356 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->ato);
1357 }1358 }1359 }1360
1361 /*1362 * If we've missed a packet, send an ack.1363 * Also start a timer to send another.1364 */1365
1366 if (!skb->acked)
1367 {1368
1369 /*1370 * This is important. If we don't have much room left,1371 * we need to throw out a few packets so we have a good1372 * window. Note that mtu is used, not mss, because mss is really1373 * for the send side. He could be sending us stuff as large as mtu.1374 */1375
1376 while (sock_rspace(sk) < sk->mtu)
1377 {1378 skb1 = skb_peek(&sk->receive_queue);
1379 if (skb1 == NULL)
1380 {1381 printk("INET: tcp.c:tcp_data memory leak detected.\n");
1382 break;
1383 }1384
1385 /*1386 * Don't throw out something that has been acked. 1387 */1388
1389 if (skb1->acked)
1390 {1391 break;
1392 }1393
1394 skb_unlink(skb1);
1395 kfree_skb(skb1, FREE_READ);
1396 }1397 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
1398 sk->ack_backlog++;
1399 tcp_reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, 0.5 * HZ));
1400 }1401 else1402 {1403 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
1404 }1405
1406 /*1407 * Now tell the user we may have some data. 1408 */1409
1410 if (!sk->dead)
1411 {1412 if(sk->debug)
1413 printk("Data wakeup.\n");
1414 sk->data_ready(sk,0);
1415 }1416 return(0);
1417 }1418
1419
1420 /*1421 * This routine is only called when we have urgent data1422 * signalled. Its the 'slow' part of tcp_urg. It could be1423 * moved inline now as tcp_urg is only called from one1424 * place. We handle URGent data wrong. We have to - as1425 * BSD still doesn't use the correction from RFC961.1426 */1427
1428 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */1429 {1430 u32ptr = ntohs(th->urg_ptr);
1431
1432 if (ptr)
1433 ptr--;
1434 ptr += ntohl(th->seq);
1435
1436 /* ignore urgent data that we've already seen and read */1437 if (after(sk->copied_seq, ptr))
1438 return;
1439
1440 /* do we already have a newer (or duplicate) urgent pointer? */1441 if (sk->urg_data && !after(ptr, sk->urg_seq))
1442 return;
1443
1444 /* tell the world about our new urgent pointer */1445 if (sk->proc != 0) {1446 if (sk->proc > 0) {1447 kill_proc(sk->proc, SIGURG, 1);
1448 }else{1449 kill_pg(-sk->proc, SIGURG, 1);
1450 }1451 }1452 sk->urg_data = URG_NOTYET;
1453 sk->urg_seq = ptr;
1454 }1455
1456 /*1457 * This is the 'fast' part of urgent handling.1458 */1459
1460 staticinlinevoidtcp_urg(structsock *sk, structtcphdr *th, unsignedlonglen)
/* */1461 {1462 /*1463 * Check if we get a new urgent pointer - normally not 1464 */1465
1466 if (th->urg)
1467 tcp_check_urg(sk,th);
1468
1469 /*1470 * Do we wait for any urgent data? - normally not1471 */1472
1473 if (sk->urg_data == URG_NOTYET) {1474 u32ptr;
1475
1476 /*1477 * Is the urgent pointer pointing into this packet? 1478 */1479 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
1480 if (ptr < len) {1481 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
1482 if (!sk->dead)
1483 sk->data_ready(sk,0);
1484 }1485 }1486 }1487
1488
/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 *
 *	Main TCP receive dispatcher: validates the segment (checksum,
 *	sequence), finds the socket, and drives the RFC793/RFC1122 state
 *	machine.  Returns 0 in all cases; the skb is either queued on the
 *	socket, queued on the backlog, or freed.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;	/* set once a SYN has been accepted, so the state test below passes */

	/*
	 *	"redo" is 1 if we have already seen this skb but couldn't
	 *	use it at that time (the socket was locked).  In that case
	 *	we have already done a lot of the work (looked up the socket
	 *	etc).
	 */
	th = skb->h.th;
	sk = skb->sk;
	if (!redo) {
		tcp_statistics.TcpInSegs++;
		/* Only frames addressed to this host; drop promiscuous pickups etc. */
		if (skb->pkt_type!=PACKET_HOST)
			goto discard_it;

		/*
		 *	Pull up the IP header.
		 */
		skb_pull(skb, skb->h.raw-skb->data);

		/*
		 *	Try to use the device checksum if provided.
		 *	NOTE: the fall-throughs below are deliberate —
		 *	CHECKSUM_NONE computes the sum, then falls into
		 *	CHECKSUM_HW which verifies it; CHECKSUM_UNNECESSARY
		 *	skips the check entirely.
		 */
		switch (skb->ip_summed) {
		case CHECKSUM_NONE:
			skb->csum = csum_partial((char *)th, len, 0);
			/* fall through */
		case CHECKSUM_HW:
			if (tcp_check(th, len, saddr, daddr, skb->csum))
				goto discard_it;
			/* fall through */
		default:
			/* CHECKSUM_UNNECESSARY */
		}
		sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
		if (!sk)
			goto no_tcp_socket;
		skb->sk = sk;
		/* Pre-compute host-order sequence fields for the rest of the path. */
		skb->seq = ntohl(th->seq);
		/* SYN and FIN each occupy one sequence number. */
		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
		skb->ack_seq = ntohl(th->ack_seq);

		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here. */
		cli();	/* interrupts off: sk->inuse test-and-set must be atomic */
		if (sk->inuse)
		{
			/* Socket busy: defer; release_sock() will re-run us with redo=1. */
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk->zapped || sk->state==TCP_CLOSE)
		goto no_tcp_socket;

	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 */

	skb->sk=sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
		{
			/* Our SYN|ACK will be retransmitted by the timer; just drop the dup. */
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,skb->ack_seq,len))
				{
					/* Reset the ack - its an ack from a
					   different connection  [ th->rst is checked in tcp_send_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq = skb->seq+1;
				sk->lastwin_seq = skb->seq+1;
				sk->fin_seq = skb->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if(sk->max_window==0)
				{
					/* Peer advertised a zero window at SYN time; use fallbacks. */
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
	 *	a more complex suggestion for fixing these reuse issues in RFC1644
	 *	but not yet ready for general use. Also see RFC1379.
	 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		/*
		 *	A fresh in-window SYN on a dead TIME_WAIT socket: tear the
		 *	old socket down and, if a listener exists, hand the SYN to
		 *	it with a sequence number stepped well past the old one.
		 */
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(skb->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			/* Uncharge the skb from the dying socket before re-owning it. */
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				/* seq+128000: keep new ISS clear of the old connection's space. */
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */

	if (!tcp_sequence(sk, skb->seq, skb->end_seq))
	{
		bad_tcp_sequence(sk, th, len, opt, saddr, dev);
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst)
		return tcp_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if(th->syn && !syn_ok)
	{
		/* Unexpected SYN in window: reset both directions. */
		tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_reset(sk,skb);
	}


	/*
	 *	Delayed ACK time estimator.
	 */

	if (sk->lrcvtime == 0)
	{
		/* First segment: seed the estimator with a conservative ato. */
		sk->lrcvtime = jiffies;
		sk->ato = HZ/3;
	}
	else
	{
		int m;

		m = jiffies - sk->lrcvtime;	/* inter-arrival time in jiffies */

		sk->lrcvtime = jiffies;

		if (m <= 0)
			m = 1;

		/* Clamp the ack timeout by the smoothed rtt/8; otherwise
		   fold the new sample into a running average. */
		if (m > (sk->rtt >> 3))
		{
			sk->ato = sk->rtt >> 3;
			/*
			 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
			 */
		}
		else
		{
			sk->ato = (sk->ato >> 1) + m;
			/*
			 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
			 */
		}
	}

	/*
	 *	Process the ACK
	 */


	if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	If the accepted buffer put us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */

	if (sk->rmem_alloc  >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}


	/*
	 *	Process urgent data
	 */

	tcp_urg(sk, th, len);

	/*
	 *	Process the encapsulated data
	 */

	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */

	release_sock(sk);
	return 0;

no_tcp_socket:
	/*
	 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
	 */
	tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);

discard_it:
	/*
	 *	Discard frame
	 */
	skb->sk = NULL;
	kfree_skb(skb, FREE_READ);
	return 0;
}