1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while sk->inuse=1 26 * and was trying to connect (tcp_err()). 27 * Alan Cox : All icmp error handling was broken 28 * pointers passed where wrong and the 29 * socket was looked up backwards. Nobody 30 * tested any icmp error code obviously. 31 * Alan Cox : tcp_err() now handled properly. It wakes people 32 * on errors. select behaves and the icmp error race 33 * has gone by moving it into sock.c 34 * Alan Cox : tcp_reset() fixed to work for everything not just 35 * packets for unknown sockets. 36 * Alan Cox : tcp option processing. 37 * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong] 38 * Herp Rosmanith : More reset fixes 39 * Alan Cox : No longer acks invalid rst frames. Acking 40 * any kind of RST is right out. 41 * Alan Cox : Sets an ignore me flag on an rst receive 42 * otherwise odd bits of prattle escape still 43 * Alan Cox : Fixed another acking RST frame bug. Should stop 44 * LAN workplace lockups. 
45 * Alan Cox : Some tidyups using the new skb list facilities 46 * Alan Cox : sk->keepopen now seems to work 47 * Alan Cox : Pulls options out correctly on accepts 48 * Alan Cox : Fixed assorted sk->rqueue->next errors 49 * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops. 50 * Alan Cox : Tidied tcp_data to avoid a potential nasty. 51 * Alan Cox : Added some better commenting, as the tcp is hard to follow 52 * Alan Cox : Removed incorrect check for 20 * psh 53 * Michael O'Reilly : ack < copied bug fix. 54 * Johannes Stille : Misc tcp fixes (not all in yet). 55 * Alan Cox : FIN with no memory -> CRASH 56 * Alan Cox : Added socket option proto entries. Also added awareness of them to accept. 57 * Alan Cox : Added TCP options (SOL_TCP) 58 * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets. 59 * Alan Cox : Use ip_tos/ip_ttl settings. 60 * Alan Cox : Handle FIN (more) properly (we hope). 61 * Alan Cox : RST frames sent on unsynchronised state ack error/ 62 * Alan Cox : Put in missing check for SYN bit. 63 * Alan Cox : Added tcp_select_window() aka NET2E 64 * window non shrink trick. 65 * Alan Cox : Added a couple of small NET2E timer fixes 66 * Charles Hedrick : TCP fixes 67 * Toomas Tamm : TCP window fixes 68 * Alan Cox : Small URG fix to rlogin ^C ack fight 69 * Charles Hedrick : Rewrote most of it to actually work 70 * Linus : Rewrote tcp_read() and URG handling 71 * completely 72 * Gerhard Koerting: Fixed some missing timer handling 73 * Matthew Dillon : Reworked TCP machine states as per RFC 74 * Gerhard Koerting: PC/TCP workarounds 75 * Adam Caldwell : Assorted timer/timing errors 76 * Matthew Dillon : Fixed another RST bug 77 * Alan Cox : Move to kernel side addressing changes. 78 * Alan Cox : Beginning work on TCP fastpathing (not yet usable) 79 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
80 * Alan Cox : TCP fast path debugging 81 * Alan Cox : Window clamping 82 * Michael Riepe : Bug in tcp_check() 83 * Matt Dillon : More TCP improvements and RST bug fixes 84 * Matt Dillon : Yet more small nasties remove from the TCP code 85 * (Be very nice to this man if tcp finally works 100%) 8) 86 * Alan Cox : BSD accept semantics. 87 * Alan Cox : Reset on closedown bug. 88 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 89 * Michael Pall : Handle select() after URG properly in all cases. 90 * Michael Pall : Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin). 91 * Michael Pall : Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now. 92 * Michael Pall : recv(...,MSG_OOB) never blocks in the BSD api. 93 * Alan Cox : Changed the semantics of sk->socket to 94 * fix a race and a signal problem with 95 * accept() and async I/O. 96 * Alan Cox : Relaxed the rules on tcp_sendto(). 97 * Yury Shevchuk : Really fixed accept() blocking problem. 98 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 99 * clients/servers which listen in on 100 * fixed ports. 101 * Alan Cox : Cleaned the above up and shrank it to 102 * a sensible code size. 103 * Alan Cox : Self connect lockup fix. 104 * Alan Cox : No connect to multicast. 105 * Ross Biro : Close unaccepted children on master 106 * socket close. 107 * Alan Cox : Reset tracing code. 108 * Alan Cox : Spurious resets on shutdown. 109 * Alan Cox : Giant 15 minute/60 second timer error 110 * Alan Cox : Small whoops in selecting before an accept. 111 * Alan Cox : Kept the state trace facility since it's 112 * handy for debugging. 113 * Alan Cox : More reset handler fixes. 114 * Alan Cox : Started rewriting the code based on the RFC's 115 * for other useful protocol references see: 116 * Comer, KA9Q NOS, and for a reference on the 117 * difference between specifications and how BSD 118 * works see the 4.4lite source. 
119 * A.N.Kuznetsov : Don't time wait on completion of tidy 120 * close. 121 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 122 * Linus Torvalds : Fixed BSD port reuse to work first syn 123 * Alan Cox : Reimplemented timers as per the RFC and using multiple 124 * timers for sanity. 125 * Alan Cox : Small bug fixes, and a lot of new 126 * comments. 127 * Alan Cox : Fixed dual reader crash by locking 128 * the buffers (much like datagram.c) 129 * Alan Cox : Fixed stuck sockets in probe. A probe 130 * now gets fed up of retrying without 131 * (even a no space) answer. 132 * Alan Cox : Extracted closing code better 133 * Alan Cox : Fixed the closing state machine to 134 * resemble the RFC. 135 * Alan Cox : More 'per spec' fixes. 136 * Jorge Cwik : Even faster checksumming. 137 * Alan Cox : tcp_data() doesn't ack illegal PSH 138 * only frames. At least one pc tcp stack 139 * generates them. 140 * Alan Cox : Cache last socket. 141 * Alan Cox : Per route irtt. 142 * Matt Day : Select() match BSD precisely on error 143 * Alan Cox : New buffers 144 * Mark Tamsky : Various sk->prot->retransmits and 145 * sk->retransmits misupdating fixed. 146 * Fixed tcp_write_timeout: stuck close, 147 * and TCP syn retries gets used now. 148 * Mark Yarvis : In tcp_read_wakeup(), don't send an 149 * ack if stat is TCP_CLOSED. 150 * Alan Cox : Look up device on a retransmit - routes may 151 * change. Doesn't yet cope with MSS shrink right 152 * but its a start! 153 * 154 * 155 * To Fix: 156 * Fast path the code. Two things here - fix the window calculation 157 * so it doesn't iterate over the queue, also spot packets with no funny 158 * options arriving in order and process directly. 159 * 160 * Implement RFC 1191 [Path MTU discovery] 161 * Look at the effect of implementing RFC 1337 suggestions and their impact. 162 * Rewrite output state machine to use a single queue and do low window 163 * situations as per the spec (RFC 1122) 164 * Speed up input assembly algorithm. 
165 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 166 * could do with it working on IPv4 167 * User settable/learned rtt/max window/mtu 168 * Cope with MTU/device switches when retransmitting in tcp. 169 * Fix the window handling to use PR's new code. 170 * 171 * Change the fundamental structure to a single send queue maintained 172 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 173 * active routes too]). Cut the queue off in tcp_retransmit/ 174 * tcp_transmit. 175 * Change the receive queue to assemble as it goes. This lets us 176 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 177 * tcp_data/tcp_read as well as the window shrink crud. 178 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 179 * tcp_queue_skb seem obvious routines to extract. 180 * 181 * This program is free software; you can redistribute it and/or 182 * modify it under the terms of the GNU General Public License 183 * as published by the Free Software Foundation; either version 184 * 2 of the License, or(at your option) any later version. 185 * 186 * Description of States: 187 * 188 * TCP_SYN_SENT sent a connection request, waiting for ack 189 * 190 * TCP_SYN_RECV received a connection request, sent ack, 191 * waiting for final ack in three-way handshake. 192 * 193 * TCP_ESTABLISHED connection established 194 * 195 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 196 * transmission of remaining buffered data 197 * 198 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 199 * to shutdown 200 * 201 * TCP_CLOSING both sides have shutdown but we still have 202 * data we have to finish sending 203 * 204 * TCP_TIME_WAIT timeout to catch resent junk before entering 205 * closed, can only be entered from FIN_WAIT2 206 * or CLOSING. 
Required because the other end 207 * may not have gotten our last ACK causing it 208 * to retransmit the data packet (which we ignore) 209 * 210 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 211 * us to finish writing our data and to shutdown 212 * (we have to close() to move on to LAST_ACK) 213 * 214 * TCP_LAST_ACK out side has shutdown after remote has 215 * shutdown. There may still be data in our 216 * buffer that we have to finish sending 217 * 218 * TCP_CLOSE socket is finished 219 */ 220
221 #include <linux/types.h>
222 #include <linux/sched.h>
223 #include <linux/mm.h>
224 #include <linux/time.h>
225 #include <linux/string.h>
226 #include <linux/config.h>
227 #include <linux/socket.h>
228 #include <linux/sockios.h>
229 #include <linux/termios.h>
230 #include <linux/in.h>
231 #include <linux/fcntl.h>
232 #include <linux/inet.h>
233 #include <linux/netdevice.h>
234 #include <net/snmp.h>
235 #include <net/ip.h>
236 #include <net/protocol.h>
237 #include <net/icmp.h>
238 #include <net/tcp.h>
239 #include <net/arp.h>
240 #include <linux/skbuff.h>
241 #include <net/sock.h>
242 #include <net/route.h>
243 #include <linux/errno.h>
244 #include <linux/timer.h>
245 #include <asm/system.h>
246 #include <asm/segment.h>
247 #include <linux/mm.h>
248 #include <net/checksum.h>
249
/*
 *	The MSL timer is the 'normal' timer.
 */

/* Alias: the MSL (maximum segment lifetime) timer is just the generic
   socket timer; kept as a macro so call sites document intent. */
#define reset_msl_timer(x,y,z)	reset_timer(x,y,z)

#define SEQ_TICK 3
unsigned long seq_offset;		/* base offset for initial sequence numbers */
struct tcp_mib tcp_statistics;		/* SNMP TCP statistics (RFC 1213 MIB) */

/*
 *	Cached last hit socket
 *
 *	A one-entry demultiplexing cache keyed on the full 4-tuple
 *	(saddr, daddr, sport, dport).  Volatile: updated from both
 *	interrupt (bottom half) and process context.
 */

volatile unsigned long th_cache_saddr, th_cache_daddr;
volatile unsigned short th_cache_dport, th_cache_sport;
volatile struct sock *th_cache_sk;
268 voidtcp_cache_zap(void)
/* */ 269 { 270 unsignedlongflags;
271 save_flags(flags);
272 cli();
273 th_cache_saddr=0;
274 th_cache_daddr=0;
275 th_cache_dport=0;
276 th_cache_sport=0;
277 th_cache_sk=NULL;
278 restore_flags(flags);
279 } 280
/* Forward declaration: tcp_close() is defined later but needed by
   tcp_close_pending() below. */
static void tcp_close(struct sock *sk, int timeout);


/*
 *	The less said about this the better, but it works and will do for 1.2
 *
 *	A single global wait queue on which select() on a listening socket
 *	sleeps; tcp_set_state() wakes it when any connection completes the
 *	handshake (SYN_RECV -> ESTABLISHED).
 */

static struct wait_queue *master_select_wakeup;

/*
 *	Return the smaller of two unsigned quantities.
 *	NOTE(review): the return type is (int) while the operands are
 *	unsigned — preserved exactly as the original callers expect.
 */
static __inline__ int min(unsigned int a, unsigned int b)
{
	return (a < b) ? a : b;
}
/* State-transition tracing is compiled out by default; define STATE_TRACE
   to get a printk on every tcp_set_state() call for sockets with
   sk->debug set. */
#undef STATE_TRACE

#ifdef STATE_TRACE
/* Human-readable names indexed by the TCP_* state constants. */
static char *statename[]={
	"Unused","Established","Syn Sent","Syn Recv",
	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
	"Close Wait","Last ACK","Listen","Closing"
};
#endif

307 static__inline__voidtcp_set_state(structsock *sk, intstate)
/* */ 308 { 309 if(sk->state==TCP_ESTABLISHED)
310 tcp_statistics.TcpCurrEstab--;
311 #ifdefSTATE_TRACE 312 if(sk->debug)
313 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
314 #endif 315 /* This is a hack but it doesn't occur often and it's going to 316 be a real to fix nicely */ 317
318 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
319 { 320 wake_up_interruptible(&master_select_wakeup);
321 } 322 sk->state=state;
323 if(state==TCP_ESTABLISHED)
324 tcp_statistics.TcpCurrEstab++;
325 } 326
327 /* 328 * This routine picks a TCP windows for a socket based on 329 * the following constraints 330 * 331 * 1. The window can never be shrunk once it is offered (RFC 793) 332 * 2. We limit memory per socket 333 * 334 * For now we use NET2E3's heuristic of offering half the memory 335 * we have handy. All is not as bad as this seems however because 336 * of two things. Firstly we will bin packets even within the window 337 * in order to get the data we are waiting for into the memory limit. 338 * Secondly we bin common duplicate forms at receive time 339 * Better heuristics welcome 340 */ 341
342 inttcp_select_window(structsock *sk)
/* */ 343 { 344 intnew_window = sk->prot->rspace(sk);
345
346 if(sk->window_clamp)
347 new_window=min(sk->window_clamp,new_window);
348 /* 349 * Two things are going on here. First, we don't ever offer a 350 * window less than min(sk->mss, MAX_WINDOW/2). This is the 351 * receiver side of SWS as specified in RFC1122. 352 * Second, we always give them at least the window they 353 * had before, in order to avoid retracting window. This 354 * is technically allowed, but RFC1122 advises against it and 355 * in practice it causes trouble. 356 * 357 * Fixme: This doesn't correctly handle the case where 358 * new_window > sk->window but not by enough to allow for the 359 * shift in sequence space. 360 */ 361 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
362 return(sk->window);
363 return(new_window);
364 } 365
366 /* 367 * Find someone to 'accept'. Must be called with 368 * sk->inuse=1 or cli() 369 */ 370
371 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 372 { 373 structsk_buff *p=skb_peek(&s->receive_queue);
374 if(p==NULL)
375 returnNULL;
376 do 377 { 378 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
379 returnp;
380 p=p->next;
381 } 382 while(p!=(structsk_buff *)&s->receive_queue);
383 returnNULL;
384 } 385
386 /* 387 * Remove a completed connection and return it. This is used by 388 * tcp_accept() to get connections from the queue. 389 */ 390
391 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 392 { 393 structsk_buff *skb;
394 unsignedlongflags;
395 save_flags(flags);
396 cli();
397 skb=tcp_find_established(s);
398 if(skb!=NULL)
399 skb_unlink(skb); /* Take it off the queue */ 400 restore_flags(flags);
401 returnskb;
402 } 403
404 /* 405 * This routine closes sockets which have been at least partially 406 * opened, but not yet accepted. Currently it is only called by 407 * tcp_close, and timeout mirrors the value there. 408 */ 409
410 staticvoidtcp_close_pending (structsock *sk)
/* */ 411 { 412 structsk_buff *skb;
413
414 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
415 { 416 skb->sk->dead=1;
417 tcp_close(skb->sk, 0);
418 kfree_skb(skb, FREE_READ);
419 } 420 return;
421 } 422
423 /* 424 * Enter the time wait state. 425 */ 426
427 staticvoidtcp_time_wait(structsock *sk)
/* */ 428 { 429 tcp_set_state(sk,TCP_TIME_WAIT);
430 sk->shutdown = SHUTDOWN_MASK;
431 if (!sk->dead)
432 sk->state_change(sk);
433 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
434 } 435
/*
 *	A socket has timed out on its send queue and wants to do a
 *	little retransmitting.  Walks sk->send_head, refreshing each
 *	queued packet (new IP id, current ack/window) and handing it back
 *	to the device.  If 'all' is zero only the first packet is resent;
 *	otherwise we stop after sk->cong_window packets.
 */

void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;
	int ct=0;		/* packets retransmitted this call */
	struct rtable *rt;

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;	/* restart the rtt clock for this packet */

		/*
		 *	Discard the surplus MAC header so the buffer starts
		 *	at the IP header; the route may have changed and a
		 *	fresh link-level header is built below.
		 */

		skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);

		/*
		 *	In general it's OK just to use the old packet.  However we
		 *	need to use the current ack and window fields.  Urg and
		 *	urg_ptr could possibly stand to be updated as well, but we
		 *	don't keep the necessary data.  That shouldn't be a problem,
		 *	if the other end is doing the right thing.  Since we're
		 *	changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)skb->data;
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = ntohs(iph->tot_len) - (iph->ihl<<2);	/* TCP segment length */

		/*
		 *	Note: We ought to check for window limits here but
		 *	currently this is done (less efficiently) elsewhere.
		 */

		iph->id = htons(ip_id_count++);
		ip_send_check(iph);	/* recompute IP header checksum after the id change */

		/*
		 *	Put a MAC header back on (may cause ARPing).  Re-route
		 *	each time: the route may have changed since the packet
		 *	was first sent.
		 */

		if(skb->localroute)
			rt=ip_rt_local(iph->daddr,NULL,NULL);
		else
			rt=ip_rt_route(iph->daddr,NULL,NULL);

		if(rt==NULL)	/* Deep poo: no route at all — report and skip sending */
		{
			if(skb->sk)
			{
				skb->sk->err=ENETUNREACH;
				skb->sk->error_report(skb->sk);
			}
		}
		else
		{
			dev=rt->rt_dev;
			skb->raddr=rt->rt_gateway;
			if(skb->raddr==0)	/* directly connected: next hop is the destination */
				skb->raddr=iph->daddr;
			skb->dev=dev;
			skb->arp=1;
			if(dev->hard_header)
			{
				/* hard_header < 0 means the MAC address isn't resolved yet;
				   clear skb->arp so the driver waits for ARP. */
				if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
					skb->arp=0;
			}

			/*
			 *	This is not the right way to handle this. We have to
			 *	issue an up to date window and ack report with this
			 *	retransmit to keep the odd buggy tcp that relies on
			 *	the fact BSD does this happy.
			 *	We don't however need to recalculate the entire
			 *	checksum, so someone wanting a small problem to play
			 *	with might like to implement RFC1141/RFC1624 and speed
			 *	this up by avoiding a full checksum.
			 */

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			/*
			 *	If the interface is (still) up and running, kick it.
			 */

			if (dev->flags & IFF_UP)
			{
				/*
				 *	If the packet is still being sent by the device/protocol
				 *	below then don't retransmit. This is both needed, and good -
				 *	especially with connected mode AX.25 where it stops resends
				 *	occurring of an as yet unsent anyway frame!
				 *	We still add up the counts as the round trip time wants
				 *	adjusting.
				 */
				if (sk && !skb_device_locked(skb))
				{
					/* Remove it from any existing driver queue first! */
					skb_unlink(skb);
					/* Now queue it */
					ip_statistics.IpOutRequests++;
					dev_queue_xmit(skb, dev, sk->priority);
				}
			}
		}

		/*
		 *	Count retransmissions (counted even when the device was
		 *	locked or unreachable — the rtt estimator wants them).
		 */

		ct++;
		sk->prot->retransmits ++;
		tcp_statistics.TcpRetransSegs++;


		/*
		 *	Only one retransmit requested.
		 */

		if (!all)
			break;

		/*
		 *	This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;	/* next packet on the retransmit queue */
	}
}
586 /* 587 * Reset the retransmission timer 588 */ 589
590 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 591 { 592 del_timer(&sk->retransmit_timer);
593 sk->ip_xmit_timeout = why;
594 if((int)when < 0)
595 { 596 when=3;
597 printk("Error: Negative timer in xmit_timer\n");
598 } 599 sk->retransmit_timer.expires=jiffies+when;
600 add_timer(&sk->retransmit_timer);
601 } 602
603 /* 604 * This is the normal code called for timeouts. It does the retransmission 605 * and then does backoff. tcp_do_retransmit is separated out because 606 * tcp_ack needs to send stuff from the retransmit queue without 607 * initiating a backoff. 608 */ 609
610
611 voidtcp_retransmit_time(structsock *sk, intall)
/* */ 612 { 613 tcp_do_retransmit(sk, all);
614
615 /* 616 * Increase the timeout each time we retransmit. Note that 617 * we do not increase the rtt estimate. rto is initialized 618 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 619 * that doubling rto each time is the least we can get away with. 620 * In KA9Q, Karn uses this for the first few times, and then 621 * goes to quadratic. netBSD doubles, but only goes up to *64, 622 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 623 * defined in the protocol as the maximum possible RTT. I guess 624 * we'll have to use something other than TCP to talk to the 625 * University of Mars. 626 * 627 * PAWS allows us longer timeouts and large windows, so once 628 * implemented ftp to mars will work nicely. We will have to fix 629 * the 120 second clamps though! 630 */ 631
632 sk->retransmits++;
633 sk->prot->retransmits++;
634 sk->backoff++;
635 sk->rto = min(sk->rto << 1, 120*HZ);
636 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
637 } 638
639
640 /* 641 * A timer event has trigger a tcp retransmit timeout. The 642 * socket xmit queue is ready and set up to send. Because 643 * the ack receive code keeps the queue straight we do 644 * nothing clever here. 645 */ 646
647 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 648 { 649 if (all)
650 { 651 tcp_retransmit_time(sk, all);
652 return;
653 } 654
655 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 656 /* sk->ssthresh in theory can be zero. I guess that's OK */ 657 sk->cong_count = 0;
658
659 sk->cong_window = 1;
660
661 /* Do the actual retransmit. */ 662 tcp_retransmit_time(sk, all);
663 } 664
/*
 *	A write timeout has occurred.  Process the after effects.
 *
 *	Returns 1 if the caller may keep using the socket, 0 if the socket
 *	was closed here (in which case release_sock() has already been
 *	called and the caller must not touch it again).
 */

static int tcp_write_timeout(struct sock *sk)
{
	/*
	 *	Look for a 'soft' timeout: every 8th retransmit while
	 *	established, or past TCP_RETR1 otherwise.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
	{
		/*
		 *	Attempt to recover if arp has changed (unlikely!) or
		 *	a route has shifted (not supported prior to 1.3).
		 */
		arp_destroy (sk->daddr, 0);
		/*ip_route_check (sk->daddr);*/
	}

	/*
	 *	Have we tried to SYN too many times (repent repent 8))
	 */

	if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
	{
		sk->err=ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		tcp_statistics.TcpAttemptFails++;	/* Is this right ??? - FIXME - */
		tcp_set_state(sk,TCP_CLOSE);
		/* Don't FIN, we got nothing back */
		release_sock(sk);
		return 0;
	}
	/*
	 *	Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2)
	{
		sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 *	Time wait the socket: if we were closing anyway, park in
		 *	TIME_WAIT; otherwise just drop straight to CLOSE.
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 *	Clean up time.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			release_sock(sk);
			return 0;
		}
	}
	return 1;
}
/*
 *	The TCP retransmit timer.  This lacks a few small details.
 *
 *	1.	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 *
 *	'data' is the struct sock this timer belongs to; the reason the
 *	timer was armed is in sk->ip_xmit_timeout.
 */

static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;

	/*
	 *	Only process if the socket is not in use (by a syscall or a
	 *	bottom half); otherwise retry in 1 second.
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;	/* lock the socket for ourselves */
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				/* Nothing left to retransmit — the ack above was all. */
				restore_flags(flags);
			}
			else
			{
				/*
				 *	Kicked by a delayed ack. Reset timer
				 *	correctly now: the head packet hasn't been
				 *	out for a full rto yet.
				 */
				if (jiffies < skb->when + sk->rto)
				{
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission
				 */
				sk->prot->retransmit (sk, 0);
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
/*
 *	This routine is called by the ICMP module when it gets some
 *	sort of error condition.  If err < 0 then the socket should
 *	be closed and the error returned to the user.  If err > 0
 *	it's just the icmp type << 8 | icmp code.  After adjustment
 *	header points to the first 8 bytes of the tcp header.  We need
 *	to find the appropriate port.
 */

void tcp_err(int err, unsigned char *header, unsigned long daddr,
	unsigned long saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	struct iphdr *iph=(struct iphdr *)header;

	/* Skip the embedded IP header to reach the quoted TCP header. */
	header+=4*iph->ihl;


	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)	/* no matching socket — nothing to report */
		return;

	if(err<0)	/* hard error: hand it to the socket directly */
	{
		sk->err = -err;
		sk->error_report(sk);
		return;
	}

	if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8))
	{
		/*
		 * FIXME:
		 * For now we will just trigger a linear backoff.
		 * The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	/*
	 *	If we've already connected we will keep trying
	 *	until we time out, or the user gives up.  Only a fatal
	 *	ICMP code — or any code during SYN_SENT — aborts.
	 */

	err &= 0xff;	/* keep only the ICMP code for the conversion table */
	if (err < 13 && (icmp_err_convert[err].fatal || sk->state == TCP_SYN_SENT))
	{
		sk->err = icmp_err_convert[err].errno;
		if (sk->state == TCP_SYN_SENT)
		{
			tcp_statistics.TcpAttemptFails++;
			tcp_set_state(sk,TCP_CLOSE);
			sk->error_report(sk);	/* Wake people up to see the error (see connect in sock.c) */
		}
	}
	return;
}
901
902 /* 903 * Walk down the receive queue counting readable data until we hit the end or we find a gap 904 * in the received data queue (ie a frame missing that needs sending to us). Not 905 * sorting using two queues as data arrives makes life so much harder. 906 */ 907
908 staticinttcp_readable(structsock *sk)
/* */ 909 { 910 unsignedlongcounted;
911 unsignedlongamount;
912 structsk_buff *skb;
913 intsum;
914 unsignedlongflags;
915
916 if(sk && sk->debug)
917 printk("tcp_readable: %p - ",sk);
918
919 save_flags(flags);
920 cli();
921 if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
922 { 923 restore_flags(flags);
924 if(sk && sk->debug)
925 printk("empty\n");
926 return(0);
927 } 928
929 counted = sk->copied_seq; /* Where we are at the moment */ 930 amount = 0;
931
932 /* 933 * Do until a push or until we are out of data. 934 */ 935
936 do 937 { 938 if (before(counted, skb->h.th->seq)) /* Found a hole so stops here */ 939 break;
940 sum = skb->len -(counted - skb->h.th->seq); /* Length - header but start from where we are up to (avoid overlaps) */ 941 if (skb->h.th->syn)
942 sum++;
943 if (sum > 0)
944 {/* Add it up, move on */ 945 amount += sum;
946 if (skb->h.th->syn)
947 amount--;
948 counted += sum;
949 } 950 /* 951 * Don't count urg data ... but do it in the right place! 952 * Consider: "old_data (ptr is here) URG PUSH data" 953 * The old code would stop at the first push because 954 * it counted the urg (amount==1) and then does amount-- 955 * *after* the loop. This means tcp_readable() always 956 * returned zero if any URG PUSH was in the queue, even 957 * though there was normal data available. If we subtract 958 * the urg data right here, we even get it to work for more 959 * than one URG PUSH skb without normal data. 960 * This means that select() finally works now with urg data 961 * in the queue. Note that rlogin was never affected 962 * because it doesn't use select(); it uses two processes 963 * and a blocking read(). And the queue scan in tcp_read() 964 * was correct. Mike <pall@rz.uni-karlsruhe.de> 965 */ 966 if (skb->h.th->urg)
967 amount--; /* don't count urg data */ 968 if (amount && skb->h.th->psh) break;
969 skb = skb->next;
970 } 971 while(skb != (structsk_buff *)&sk->receive_queue);
972
973 restore_flags(flags);
974 if(sk->debug)
975 printk("got %lu bytes.\n",amount);
976 return(amount);
977 } 978
979 /* 980 * LISTEN is a special case for select.. 981 */ 982 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */ 983 { 984 if (sel_type == SEL_IN) { 985 intretval;
986
987 sk->inuse = 1;
988 retval = (tcp_find_established(sk) != NULL);
989 release_sock(sk);
990 if (!retval)
991 select_wait(&master_select_wakeup,wait);
992 returnretval;
993 } 994 return 0;
995 } 996
997
998 /* 999 * Wait for a TCP event.1000 *1001 * Note that we don't need to set "sk->inuse", as the upper select layers1002 * take care of normal races (between the test and the event) and we don't1003 * go look at any of the socket buffers directly.1004 */1005 staticinttcp_select(structsock *sk, intsel_type, select_table *wait)
/* */1006 {1007 if (sk->state == TCP_LISTEN)
1008 returntcp_listen_select(sk, sel_type, wait);
1009
1010 switch(sel_type) {1011 caseSEL_IN:
1012 if (sk->err)
1013 return 1;
1014 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1015 break;
1016
1017 if (sk->shutdown & RCV_SHUTDOWN)
1018 return 1;
1019
1020 if (sk->acked_seq == sk->copied_seq)
1021 break;
1022
1023 if (sk->urg_seq != sk->copied_seq ||
1024 sk->acked_seq != sk->copied_seq+1 ||
1025 sk->urginline || !sk->urg_data)
1026 return 1;
1027 break;
1028
1029 caseSEL_OUT:
1030 if (sk->err)
1031 return 1;
1032 if (sk->shutdown & SEND_SHUTDOWN)
1033 return 0;
1034 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1035 break;
1036 /*1037 * This is now right thanks to a small fix1038 * by Matt Dillon.1039 */1040
1041 if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
1042 break;
1043 return 1;
1044
1045 caseSEL_EX:
1046 if (sk->urg_data)
1047 return 1;
1048 break;
1049 }1050 select_wait(sk->sleep, wait);
1051 return 0;
1052 }1053
/*
 *	ioctl() handler for TCP sockets.
 *
 *	TIOCINQ  - bytes readable right now (invalid on a listening socket)
 *	SIOCATMARK - non-zero if the read pointer is at the urgent mark
 *	TIOCOUTQ - free space in the send buffer (invalid when listening)
 *
 *	All three write an int back to user space at 'arg' after a
 *	verify_area() check; anything else is -EINVAL.
 */
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	int err;
	switch(cmd)
	{

		case TIOCINQ:
#ifdef FIXME	/* FIXME: */
		case FIONREAD:
#endif
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);

			/* Lock the socket while tcp_readable walks the queue. */
			sk->inuse = 1;
			amount = tcp_readable(sk);
			release_sock(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		case SIOCATMARK:
		{
			/* At the mark when urgent data exists and the urgent
			   sequence number equals where the reader is. */
			int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;

			err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
			if (err)
				return err;
			put_user(answ,(int *) arg);
			return(0);
		}
		case TIOCOUTQ:
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN) return(-EINVAL);
			amount = sk->prot->wspace(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		default:
			return(-EINVAL);
	}
}
1106
/*
 *	This routine computes a TCP checksum.
 *
 *	Modified January 1995 from a go-faster DOS routine by
 *	Jorge Cwik <jorge@laser.satlink.net>
 *
 *	Thin wrapper around csum_tcpudp_magic(): folds the pseudo header
 *	(source/destination address, length, protocol) into the partial
 *	checksum 'base'.  The 'th' argument is unused here but kept so the
 *	signature matches existing callers.
 */
unsigned short tcp_check(struct tcphdr *th, int len,
	  unsigned long saddr, unsigned long daddr, unsigned long base)
{
	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
}
1120
1121
1122 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1123 unsignedlongdaddr, intlen, structsock *sk)
1124 {1125 th->check = 0;
1126 th->check = tcp_check(th, len, saddr, daddr,
1127 csum_partial((char *)th,len,0));
1128 return;
1129 }1130
1131 /*1132 * This is the main buffer sending routine. We queue the buffer1133 * having checked it is sane seeming.1134 */1135
1136 staticvoidtcp_send_skb(structsock *sk, structsk_buff *skb)
/* */1137 {1138 intsize;
1139 structtcphdr * th = skb->h.th;
1140
1141 /*1142 * length of packet (not counting length of pre-tcp headers) 1143 */1144
1145 size = skb->len - ((unsignedchar *) th - skb->data);
1146
1147 /*1148 * Sanity check it.. 1149 */1150
1151 if (size < sizeof(structtcphdr) || size > skb->len)
1152 {1153 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1154 skb, skb->data, th, skb->len);
1155 kfree_skb(skb, FREE_WRITE);
1156 return;
1157 }1158
1159 /*1160 * If we have queued a header size packet.. (these crash a few1161 * tcp stacks if ack is not set)1162 */1163
1164 if (size == sizeof(structtcphdr))
1165 {1166 /* If it's got a syn or fin it's notionally included in the size..*/1167 if(!th->syn && !th->fin)
1168 {1169 printk("tcp_send_skb: attempt to queue a bogon.\n");
1170 kfree_skb(skb,FREE_WRITE);
1171 return;
1172 }1173 }1174
1175 /*1176 * Actual processing.1177 */1178
1179 tcp_statistics.TcpOutSegs++;
1180 skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1181
1182 /*1183 * We must queue if1184 *1185 * a) The right edge of this frame exceeds the window1186 * b) We are retransmitting (Nagle's rule)1187 * c) We have too many packets 'in flight'1188 */1189
1190 if (after(skb->h.seq, sk->window_seq) ||
1191 (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1192 sk->packets_out >= sk->cong_window)
1193 {1194 /* checksum will be supplied by tcp_write_xmit. So1195 * we shouldn't need to set it at all. I'm being paranoid */1196 th->check = 0;
1197 if (skb->next != NULL)
1198 {1199 printk("tcp_send_partial: next != NULL\n");
1200 skb_unlink(skb);
1201 }1202 skb_queue_tail(&sk->write_queue, skb);
1203
1204 /*1205 * If we don't fit we have to start the zero window1206 * probes. This is broken - we really need to do a partial1207 * send _first_ (This is what causes the Cisco and PC/TCP1208 * grief).1209 */1210
1211 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1212 sk->send_head == NULL && sk->ack_backlog == 0)
1213 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1214 }1215 else1216 {1217 /*1218 * This is going straight out1219 */1220
1221 th->ack_seq = ntohl(sk->acked_seq);
1222 th->window = ntohs(tcp_select_window(sk));
1223
1224 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1225
1226 sk->sent_seq = sk->write_seq;
1227
1228 /*1229 * This is mad. The tcp retransmit queue is put together1230 * by the ip layer. This causes half the problems with1231 * unroutable FIN's and other things.1232 */1233
1234 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1235
1236 /*1237 * Set for next retransmit based on expected ACK time.1238 * FIXME: We set this every time which means our 1239 * retransmits are really about a window behind.1240 */1241
1242 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1243 }1244 }1245
1246 /*1247 * Locking problems lead us to a messy situation where we can have1248 * multiple partially complete buffers queued up. This is really bad1249 * as we don't want to be sending partial buffers. Fix this with1250 * a semaphore or similar to lock tcp_write per socket.1251 *1252 * These routines are pretty self descriptive.1253 */1254
1255 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1256 {1257 structsk_buff * skb;
1258 unsignedlongflags;
1259
1260 save_flags(flags);
1261 cli();
1262 skb = sk->partial;
1263 if (skb) {1264 sk->partial = NULL;
1265 del_timer(&sk->partial_timer);
1266 }1267 restore_flags(flags);
1268 returnskb;
1269 }1270
1271 /*1272 * Empty the partial queue1273 */1274
1275 staticvoidtcp_send_partial(structsock *sk)
/* */1276 {1277 structsk_buff *skb;
1278
1279 if (sk == NULL)
1280 return;
1281 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1282 tcp_send_skb(sk, skb);
1283 }1284
/*
 *	Queue a partial frame
 *
 *	Installs skb as the socket's pending partial buffer and arms a one
 *	second timer that will flush it via tcp_send_partial().  If another
 *	partial buffer was already pending it is sent immediately (after
 *	interrupts are re-enabled) so at most one partial frame is queued.
 */
void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	/* Swap in the new partial under interrupt protection. */
	tmp = sk->partial;
	if (tmp)
		del_timer(&sk->partial_timer);
	sk->partial = skb;
	init_timer(&sk->partial_timer);
	/*
	 *	Wait up to 1 second for the buffer to fill.
	 */
	sk->partial_timer.expires = jiffies+HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	/* Old partial (if any) is transmitted outside the cli() region. */
	if (tmp)
		tcp_send_skb(sk, tmp);
}
1313
/*
 *	This routine sends an ack and also updates the window.
 *
 *	'sequence'/'ack' are the sequence and acknowledgement numbers to
 *	place in the frame (host order); 'th' is the received header the
 *	reply is modelled on; 'daddr' the destination address.  If no
 *	memory is available the ack is deferred via the backlog counter
 *	and write timer instead of being dropped silently.
 */
static void tcp_send_ack(u32 sequence, u32 ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;		/* We have been reset, we may not send again */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */

	buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 *	Force it to send an ack. We don't have to do this
		 *	(ACK is unreliable) but it's much better use of
		 *	bandwidth on slow links to send a spare ack than
		 *	resend packets.
		 */

		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 *	Assemble a suitable TCP frame
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route: give the buffer back and forget the ack. */
		buff->free = 1;
		sk->prot->wfree(sk, buff);
		return;
	}
	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	/* Start from the received header, then rewrite what differs. */
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;

	/*
	 *	If we have nothing queued for transmit and the transmit timer
	 *	is on we are just doing an ACK timeout and need to switch
	 *	to a keepalive.
	 */

	if (ack == sk->acked_seq)
	{
		/* This ack covers everything received: clear the backlog. */
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
				  && sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 *	Fill in the packet and send it
	 */

	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}
1429
1430 /* 1431 * This routine builds a generic TCP header. 1432 */1433
1434 extern__inlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */1435 {1436
1437 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1438 th->seq = htonl(sk->write_seq);
1439 th->psh =(push == 0) ? 1 : 0;
1440 th->doff = sizeof(*th)/4;
1441 th->ack = 1;
1442 th->fin = 0;
1443 sk->ack_backlog = 0;
1444 sk->bytes_rcv = 0;
1445 sk->ack_timed = 0;
1446 th->ack_seq = htonl(sk->acked_seq);
1447 sk->window = tcp_select_window(sk);
1448 th->window = htons(sk->window);
1449
1450 return(sizeof(*th));
1451 }1452
/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	Returns the number of bytes copied, or a negative errno.  Blocks
 *	(unless 'nonblock') while the connection completes or while buffer
 *	memory is unavailable.  If some data was copied before an error or
 *	would-block condition, the byte count is returned instead of the
 *	error.  MSG_OOB in 'flags' sends the data as urgent.
 */
static int tcp_write(struct sock *sk, const unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;
	int copy;
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1;
	prot = sk->prot;
	while(len > 0)
	{
		if (sk->err)
		{			/* Stop on an error */
			release_sock(sk);
			if (copied)
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 *	First thing we do is make sure that we are established.
		 */

		if (sk->shutdown & SEND_SHUTDOWN)
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied)
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/*
		 *	Wait for a connection to finish.
		 */

		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
		{
			if (sk->err)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			/* Not connecting either: the connection is dead. */
			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
			{
				release_sock(sk);
				if (copied)
					return(copied);

				if (sk->err)
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen)
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/* Sleep until the handshake finishes or fails. */
			release_sock(sk);
			cli();

			if (sk->state != TCP_ESTABLISHED &&
			    sk->state != TCP_CLOSE_WAIT && sk->err == 0)
			{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

	/*
	 *	The following code can result in copy <= if sk->mss is ever
	 *	decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
	 *	sk->mtu is constant once SYN processing is finished.  I.e. we
	 *	had better not get here until we've seen his SYN and at least one
	 *	valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
	 *	But ESTABLISHED should guarantee that.  sk->max_window is by definition
	 *	non-decreasing.  Note that any ioctl to set user_mss must be done
	 *	before the exchange of SYN's.  If the initial ack from the other
	 *	end has a window of 0, max_window and thus mss will both be 0.
	 */

	/*
	 *	Now we need to check if we have a half built packet.
	 */

		if ((skb = tcp_dequeue_partial(sk)) != NULL)
		{
			int hdrlen;

			/* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				 + sizeof(struct tcphdr);

			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB))
			{
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0)
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}

				memcpy_fromfs(skb_put(skb,copy), from, copy);
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy;
			}
			/* Send if full, urgent, or nothing is in flight. */
			if ((skb->len - hdrlen) >= sk->mss ||
				(flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);
			continue;
		}

	/*
	 *	We also need to worry about the window.
	 *	If window < 1/2 the maximum window we've seen from this
	 *	host, don't use it.  This is sender side
	 *	silly window prevention, as specified in RFC1122.
	 *	(Note that this is different than earlier versions of
	 *	SWS prevention, e.g. RFC813.).  What we actually do is
	 *	use the whole MSS.  Since the results in the right
	 *	edge of the packet being outside the window, it will
	 *	be queued for later rather than sent.
	 */

		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		if (copy > len)
			copy = len;

	/*
	 *	We should really check the window here also.
	 */

		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB))
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
			sk->inuse = 1;
			/* Remember this as a partial-frame candidate. */
			send_tmp = skb;
		}
		else
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep.
		 */

		if (skb == NULL)
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition.
			 */

			tmp = sk->wmem_alloc;
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it.
			 *	Only sleep if no memory was freed meanwhile.
			 */
			if (tmp <= sk->wmem_alloc &&
			  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0)
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

		/*
		 *	FIXME: we need to optimize this.
		 *	Perhaps some hints here would be good.
		 */

		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 )
		{
			prot->wfree(sk, skb);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}
		skb->dev = dev;
		skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
		tmp = tcp_build_header(skb->h.th, sk, len-copy);
		if (tmp < 0)
		{
			prot->wfree(sk, skb);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB)
		{
			skb->h.th->urg = 1;
			skb->h.th->urg_ptr = ntohs(copy);
		}

		memcpy_fromfs(skb_put(skb,copy), from, copy);

		from += copy;
		copied += copy;
		len -= copy;
		skb->free = 0;
		sk->write_seq += copy;

		/* Sub-MSS frame while data is in flight: hold it as a partial. */
		if (send_tmp != NULL && sk->packets_out)
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

	/*
	 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
	 *	interactive fast network servers. It's meant to be on and
	 *	it really improves the throughput though not the echo time
	 *	on my slow slip link - Alan
	 */

	/*
	 *	Avoid possible race on send_tmp - c/o Johannes Stille
	 */

	if(sk->partial && ((!sk->packets_out)
     /* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
1775 /*1776 * This is just a wrapper. 1777 */1778
1779 staticinttcp_sendto(structsock *sk, constunsignedchar *from,
/* */1780 intlen, intnonblock, unsignedflags,
1781 structsockaddr_in *addr, intaddr_len)
1782 {1783 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1784 return -EINVAL;
1785 if (sk->state == TCP_CLOSE)
1786 return -ENOTCONN;
1787 if (addr_len < sizeof(*addr))
1788 return -EINVAL;
1789 if (addr->sin_family && addr->sin_family != AF_INET)
1790 return -EINVAL;
1791 if (addr->sin_port != sk->dummy_th.dest)
1792 return -EISCONN;
1793 if (addr->sin_addr.s_addr != sk->daddr)
1794 return -EISCONN;
1795 returntcp_write(sk, from, len, nonblock, flags);
1796 }1797
1798
/*
 *	Send an ack if one is backlogged at this point. Ought to merge
 *	this with tcp_send_ack().
 *
 *	Builds a bare ACK segment from the socket's template header with
 *	the current window and acked sequence.  No-op when no ack is
 *	pending or the connection is closed/closing down.
 */
static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	if (!sk->ack_backlog)
		return;

	/*
	 *	If we're closed, don't send an ack, or we'll get a RST
	 *	from the closed destination.
	 */
	if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
		return;

	/*
	 *	FIXME: we need to put code here to prevent this routine from
	 *	being called.  Being called once in a while is ok, so only check
	 *	if this is the second time in a row.
	 */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */

	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route: drop the ack (it is unreliable anyway). */
		buff->free = 1;
		sk->prot->wfree(sk, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	/* Template header, then the ACK-specific fields. */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
1878
/*
 *	FIXME:
 *	This routine frees used buffers.
 *	It should consider sending an ACK to let the
 *	other end know we now have a bigger window.
 *
 *	Frees all fully consumed skbs at the head of the receive queue,
 *	then decides whether the freed space warrants an immediate window
 *	update ack or merely a delayed one.
 */
static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	/* Space available before we free anything, for comparison below. */
	left = sk->prot->rspace(sk);

	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can.
	 */

	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		/* Stop at the first buffer still unread or in use by a reader. */
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */

	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
					    left);
	if ((rspace=sk->prot->rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
	/*
	 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
	 * if the other end is offering a window smaller than the agreed on MSS
	 * (called sk->mtu here).  In theory there's no connection between send
	 * and receive, and so no reason to think that they're going to send
	 * small packets.  For the moment I'm using the hack of reducing the mss
	 * only on the send side, so I'm putting mtu here.
	 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				/* An earlier timer was pending: put it back. */
				add_timer(&sk->retransmit_timer);
		}
	}
}
1968
/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 *
 *	Returns 1 with the single OOB byte copied out, 0 at EOF-like
 *	states, or a negative errno.  Never blocks (see note at bottom).
 */
static int tcp_read_urg(struct sock * sk, int nonblock,
	     unsigned char *to, int len, unsigned flags)
{
	/*
	 *	No URG data to read: either none arrived, it was already
	 *	consumed, or it is being delivered inline with normal data.
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
	{
		int tmp = -sk->err;
		sk->err = 0;
		return tmp;
	}

	if (sk->state == TCP_CLOSE || sk->done)
	{
		/* First read after close reports EOF, subsequent ones error. */
		if (!sk->done) {
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID)
	{
		/* Low byte of urg_data holds the out-of-band byte itself. */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;
		put_fs_byte(c, to);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
2026
/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Returns bytes copied or a negative errno.  MSG_PEEK reads without
 *	consuming (via a private sequence counter); MSG_OOB is diverted to
 *	tcp_read_urg().  Blocks waiting for data unless 'nonblock'.
 */
static int tcp_read(struct sock *sk, unsigned char *to,
	int len, int nonblock, unsigned flags)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */

	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */

	if (flags & MSG_OOB)
		return tcp_read_urg(sk, nonblock, to, len, flags);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 */

	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;	/* PEEK consumes nothing */

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		u32 offset;

		/*
		 *	Are we at urgent data? Stop if we have read anything.
		 */

		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.  State must be set before the queue
		 *	scan so a wakeup between scan and schedule() is not lost.
		 */

		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* Gap before this buffer: nothing contiguous to read. */
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies a sequence number but no data */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* fully consumed: cleanup_rbuf may free it */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* No more contiguous data; return what we have, or decide how to wait. */
		if (copied)
			break;

		if (sk->err)
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;	/* first read after close: EOF */
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Ack consumed data, drop the lock and sleep for more. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */

		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */

		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?
		 */

		if (sk->urg_data)
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					/* Skip the urgent byte unless delivered inline. */
					if (!sk->urginline)
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					/* Read only up to the urgent mark. */
					used = urg_offset;
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */

		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */

		memcpy_tofs(to,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		to += used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */

		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;	/* urgent byte is behind us now */
		if (used + offset < skb->len)
			continue;

		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		continue;

	found_fin_ok:
		++*seq;		/* FIN consumes a sequence number */
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */

		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	closed.
 *
 *	Returns 1 if the caller should transmit a FIN, 0 otherwise.
 *	'dead' is set when no process holds the socket any more.
 */
static int tcp_close_state(struct sock *sk, int dead)
{
	int ns=TCP_CLOSE;
	int send_fin=0;
	switch(sk->state)
	{
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:	/* Closedown begin */
			ns=TCP_FIN_WAIT1;
			send_fin=1;
			break;
		case TCP_FIN_WAIT1:	/* Already closing, or FIN sent: no change */
		case TCP_FIN_WAIT2:
		case TCP_CLOSING:
			ns=sk->state;
			break;
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;
	}

	tcp_set_state(sk,ns);

	/*
	 *	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 */
	if(dead && ns==TCP_FIN_WAIT2)
	{
		/* Only start the FIN_WAIT2 timeout if no timer is running. */
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
	}

	return send_fin;
}
2315 /*2316 * Send a fin.2317 */2318
2319 staticvoidtcp_send_fin(structsock *sk)
/* */2320 {2321 structproto *prot =(structproto *)sk->prot;
2322 structtcphdr *th =(structtcphdr *)&sk->dummy_th;
2323 structtcphdr *t1;
2324 structsk_buff *buff;
2325 structdevice *dev=NULL;
2326 inttmp;
2327
2328 release_sock(sk); /* in case the malloc sleeps. */2329
2330 buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2331 sk->inuse = 1;
2332
2333 if (buff == NULL)
2334 {2335 /* This is a disaster if it occurs */2336 printk("tcp_send_fin: Impossible malloc failure");
2337 return;
2338 }2339
2340 /*2341 * Administrivia2342 */2343
2344 buff->sk = sk;
2345 buff->localroute = sk->localroute;
2346
2347 /*2348 * Put in the IP header and routing stuff. 2349 */2350
2351 tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2352 IPPROTO_TCP, sk->opt,
2353 sizeof(structtcphdr),sk->ip_tos,sk->ip_ttl);
2354 if (tmp < 0)
2355 {2356 intt;
2357 /*2358 * Finish anyway, treat this as a send that got lost. 2359 * (Not good).2360 */2361
2362 buff->free = 1;
2363 prot->wfree(sk,buff);
2364 sk->write_seq++;
2365 t=del_timer(&sk->timer);
2366 if(t)
2367 add_timer(&sk->timer);
2368 else2369 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2370 return;
2371 }2372
2373 /*2374 * We ought to check if the end of the queue is a buffer and2375 * if so simply add the fin to that buffer, not send it ahead.2376 */2377
2378 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
2379 buff->dev = dev;
2380 memcpy(t1, th, sizeof(*t1));
2381 t1->seq = ntohl(sk->write_seq);
2382 sk->write_seq++;
2383 buff->h.seq = sk->write_seq;
2384 t1->ack = 1;
2385 t1->ack_seq = ntohl(sk->acked_seq);
2386 t1->window = ntohs(sk->window=tcp_select_window(sk));
2387 t1->fin = 1;
2388 t1->rst = 0;
2389 t1->doff = sizeof(*t1)/4;
2390 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2391
2392 /*2393 * If there is data in the write queue, the fin must be appended to2394 * the write queue.2395 */2396
2397 if (skb_peek(&sk->write_queue) != NULL)
2398 {2399 buff->free = 0;
2400 if (buff->next != NULL)
2401 {2402 printk("tcp_send_fin: next != NULL\n");
2403 skb_unlink(buff);
2404 }2405 skb_queue_tail(&sk->write_queue, buff);
2406 }2407 else2408 {2409 sk->sent_seq = sk->write_seq;
2410 sk->prot->queue_xmit(sk, dev, buff, 0);
2411 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2412 }2413 }2414
/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or set sk->dead=1.
 *
 *	sk:	connection to shut down (locked here via sk->inuse and
 *		released with release_sock() before returning)
 *	how:	direction mask; only SEND_SHUTDOWN is acted on — a
 *		receive-only shutdown request is a no-op for TCP here.
 */

void tcp_shutdown(struct sock *sk, int how)
{
	/*
	 *	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 *	Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
	 */

	if (!(how & SEND_SHUTDOWN))
		return;

	/*
	 *	If we've already sent a FIN, or it's a closed state,
	 *	there is nothing more to send — bail out before taking
	 *	the socket lock.
	 */

	if (sk->state == TCP_FIN_WAIT1 ||
	    sk->state == TCP_FIN_WAIT2 ||
	    sk->state == TCP_CLOSING ||
	    sk->state == TCP_LAST_ACK ||
	    sk->state == TCP_TIME_WAIT ||
	    sk->state == TCP_CLOSE ||
	    sk->state == TCP_LISTEN
	  )
	{
		return;
	}
	sk->inuse = 1;	/* lock the socket for the duration */

	/*
	 *	flag that the sender has shutdown
	 */

	sk->shutdown |= SEND_SHUTDOWN;

	/*
	 *	Clear out any half completed packets: flush the partial
	 *	send buffer so the FIN goes out after all queued data.
	 */

	if (sk->partial)
		tcp_send_partial(sk);

	/*
	 *	FIN if needed: tcp_close_state() moves the state machine
	 *	and tells us whether a FIN segment must actually be sent.
	 */

	if (tcp_close_state(sk,0))
		tcp_send_fin(sk);

	release_sock(sk);
}
2471
2472 staticint2473 tcp_recvfrom(structsock *sk, unsignedchar *to,
/* */2474 intto_len, intnonblock, unsignedflags,
2475 structsockaddr_in *addr, int *addr_len)
2476 {2477 intresult;
2478
2479 /* 2480 * Have to check these first unlike the old code. If 2481 * we check them after we lose data on an error2482 * which is wrong 2483 */2484
2485 if(addr_len)
2486 *addr_len = sizeof(*addr);
2487 result=tcp_read(sk, to, to_len, nonblock, flags);
2488
2489 if (result < 0)
2490 return(result);
2491
2492 if(addr)
2493 {2494 addr->sin_family = AF_INET;
2495 addr->sin_port = sk->dummy_th.dest;
2496 addr->sin_addr.s_addr = sk->daddr;
2497 }2498 return(result);
2499 }2500
2501
/*
 *	This routine will send an RST to the other tcp, in reply to the
 *	segment *th that arrived from saddr/daddr. The reply is built on
 *	an anonymous (sk == NULL) buffer and transmitted immediately.
 *
 *	saddr/daddr: our address / peer address, as seen in the offending
 *	             packet (already network byte order).
 *	prot/opt/dev/tos/ttl: protocol ops, IP options and routing inputs
 *	             used to build the outgoing IP header.
 */

static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	int tmp;
	struct device *ndev=NULL;

	/*
	 *	Cannot reset a reset (Think about it).
	 */

	if(th->rst)
		return;

	/*
	 *	We need to grab some memory, and put together an RST,
	 *	and then put it into the queue to be sent. GFP_ATOMIC:
	 *	we may be on the softirq/receive path, no sleeping.
	 */

	buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
		return;	/* out of memory: silently drop, peer will retry */

	buff->sk = NULL;	/* not charged to any socket */
	buff->dev = dev;
	buff->localroute = 0;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr),tos,ttl);
	if (tmp < 0)
	{
		/* No route/header — release the buffer and give up. */
		buff->free = 1;
		prot->wfree(NULL, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	/* Start from a copy of the offending header, then rewrite it. */
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive ports.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->rst = 1;
	t1->window = 0;

	/*
	 *	RFC 793 reset generation: if the incoming segment carried an
	 *	ACK, our RST takes its sequence from that ack (no ACK bit);
	 *	otherwise we send seq 0 and ack the segment's own sequence
	 *	(+1 for the SYN, which occupies sequence space).
	 */
	if(th->ack)
	{
		t1->ack = 0;
		t1->seq = th->ack_seq;	/* already network byte order */
		t1->ack_seq = 0;
	}
	else
	{
		t1->ack = 1;
		if(!th->syn)
			t1->ack_seq=htonl(th->seq);
		else
			t1->ack_seq=htonl(th->seq+1);
		t1->seq=0;
	}

	t1->syn = 0;
	t1->urg = 0;
	t1->fin = 0;
	t1->psh = 0;
	t1->doff = sizeof(*t1)/4;	/* header only, no options */
	tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
	prot->queue_xmit(NULL, ndev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2585
2586 /*2587 * Look for tcp options. Parses everything but only knows about MSS.2588 * This routine is always called with the packet containing the SYN.2589 * However it may also be called with the ack to the SYN. So you2590 * can't assume this is always the SYN. It's always called after2591 * we have set up sk->mtu to our own MTU.2592 *2593 * We need at minimum to add PAWS support here. Possibly large windows2594 * as Linux gets deployed on 100Mb/sec networks.2595 */2596
2597 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2598 {2599 unsignedchar *ptr;
2600 intlength=(th->doff*4)-sizeof(structtcphdr);
2601 intmss_seen = 0;
2602
2603 ptr = (unsignedchar *)(th + 1);
2604
2605 while(length>0)
2606 {2607 intopcode=*ptr++;
2608 intopsize=*ptr++;
2609 switch(opcode)
2610 {2611 caseTCPOPT_EOL:
2612 return;
2613 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */2614 length--;
2615 ptr--; /* the opsize=*ptr++ above was a mistake */2616 continue;
2617
2618 default:
2619 if(opsize<=2) /* Avoid silly options looping forever */2620 return;
2621 switch(opcode)
2622 {2623 caseTCPOPT_MSS:
2624 if(opsize==4 && th->syn)
2625 {2626 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2627 mss_seen = 1;
2628 }2629 break;
2630 /* Add other options here as people feel the urge to implement stuff like large windows */2631 }2632 ptr+=opsize-2;
2633 length-=opsize;
2634 }2635 }2636 if (th->syn)
2637 {2638 if (! mss_seen)
2639 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2640 }2641 #ifdefCONFIG_INET_PCTCP2642 sk->mss = min(sk->max_window >> 1, sk->mtu);
2643 #else2644 sk->mss = min(sk->max_window, sk->mtu);
2645 #endif2646 }2647
/*
 *	Return the classful netmask implied by a destination address.
 *	Both the argument and the result are in network byte order.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);	/* classify in host order */
	unsigned long mask;

	if (IN_CLASSA(host))
		mask = IN_CLASSA_NET;
	else if (IN_CLASSB(host))
		mask = IN_CLASSB_NET;
	else
		mask = IN_CLASSC_NET;

	return htonl(mask);
}
2658 /*2659 * Default sequence number picking algorithm.2660 * As close as possible to RFC 793, which2661 * suggests using a 250kHz clock.2662 * Further reading shows this assumes 2MB/s networks.2663 * For 10MB/s ethernet, a 1MHz clock is appropriate.2664 * That's funny, Linux has one built in! Use it!2665 */2666
2667 externinlineu32tcp_init_seq(void)
/* */2668 {2669 structtimevaltv;
2670 do_gettimeofday(&tv);
2671 returntv.tv_usec+tv.tv_sec*1000000;
2672 }2673
/*
 *	This routine handles a connection request (incoming SYN on a
 *	listening socket sk).
 *	It should make sure we haven't already responded.
 *	Because of the way BSD works, we have to send a syn/ack now.
 *	This also means it will be harder to close a socket which is
 *	listening.
 *
 *	sk:	the listening socket
 *	skb:	the received SYN segment (consumed here on every path)
 *	daddr/saddr: our / the peer's address from the IP header
 *	opt/dev: IP options and receiving device
 *	seq:	initial send sequence number to use for the new socket
 */

static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 unsigned long daddr, unsigned long saddr,
		 struct options *opt, struct device *dev, u32 seq)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	unsigned char *ptr;
	struct sock *newsk;
	struct tcphdr *th;
	struct device *ndev=NULL;
	int tmp;
	struct rtable *rt;

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection: reset it. */
	if (!sk->dead)
	{
		sk->data_ready(sk,0);	/* wake anyone blocked in accept() */
	}
	else
	{
		if(sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * Make sure we can accept more.  This will prevent a
	 * flurry of syns from eating up all our memory.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * We need to build a new sock struct.
	 * It is sort of bad to have a socket without an inode attached
	 * to it, but the wake_up's will just wake up the listening socket,
	 * and if the listening socket is destroyed before this is taken
	 * off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* just ignore the syn.  It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	Clone the listener, then re-initialise every per-connection
	 *	field; the queues and timers must NOT be shared with sk.
	 */
	memcpy(newsk, sk, sizeof(*newsk));
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;	/* slow start: one segment */
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	newsk->acked_seq = skb->h.th->seq+1;	/* SYN consumes one sequence number */
	newsk->copied_seq = skb->h.th->seq+1;
	newsk->fin_seq = skb->h.th->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger=0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function=&retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/*
	 *	Swap these two, they are from our point of view.
	 */

	newsk->daddr = saddr;
	newsk->saddr = daddr;

	put_sock(newsk->num,newsk);	/* enter the new socket in the hash table */
	newsk->dummy_th.res1 = 0;
	newsk->dummy_th.doff = 6;	/* header + 4 bytes of MSS option */
	newsk->dummy_th.fin = 0;
	newsk->dummy_th.syn = 0;
	newsk->dummy_th.rst = 0;
	newsk->dummy_th.psh = 0;
	newsk->dummy_th.ack = 0;
	newsk->dummy_th.urg = 0;
	newsk->dummy_th.res2 = 0;
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->socket = NULL;	/* no inode/struct socket until accept() */

	/*
	 *	Grab the ttl and tos values and use them
	 */

	newsk->ip_ttl=sk->ip_ttl;
	newsk->ip_tos=skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for
	 */

	/*
	 *	Note use of sk->user_mss, since user has no direct access to newsk
	 */

	rt=ip_rt_route(saddr, NULL,NULL);

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	/*
	 *	MSS selection: explicit user setting, then per-route MSS,
	 *	then 576 for off-net peers / MAX_WINDOW for local ones.
	 */
	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
		newsk->mtu = rt->rt_mss - HEADER_SIZE;
	else
	{
#ifdef CONFIG_INET_SNARL	/* Sub Nets Are Local */
		if ((saddr ^ daddr) & default_mask(saddr))
#else
		if ((saddr ^ daddr) & dev->pa_mask)
#endif
			newsk->mtu = 576 - HEADER_SIZE;
		else
			newsk->mtu = MAX_WINDOW;
	}

	/*
	 *	But not bigger than device MTU
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	This will min with what arrived in the packet
	 */

	tcp_options(newsk,skb->h.th);

	tcp_cache_zap();	/* header cache may now be stale */

	buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		sk->err = ENOMEM;
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		/* And this will destroy it */
		release_sock(newsk);
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->sk = newsk;
	buff->localroute = newsk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
			       IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

	/*
	 *	Something went wrong.
	 */

	if (tmp < 0)
	{
		sk->err = tmp;
		buff->free = 1;
		kfree_skb(buff,FREE_WRITE);
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		release_sock(newsk);	/* destroys the half-built socket */
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	/*
	 *	Build the SYN/ACK segment.
	 */
	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1, skb->h.th, sizeof(*t1));
	buff->h.seq = newsk->write_seq;
	/*
	 *	Swap the send and the receive.
	 */
	t1->dest = skb->h.th->source;
	t1->source = newsk->dummy_th.source;
	t1->seq = ntohl(newsk->write_seq++);
	t1->ack = 1;
	newsk->window = tcp_select_window(newsk);
	newsk->sent_seq = newsk->write_seq;
	t1->window = ntohs(newsk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->ack_seq = ntohl(skb->h.th->seq+1);
	t1->doff = sizeof(*t1)/4+1;	/* +1 word for the MSS option */
	/* Append the MSS option: kind 2, length 4, 16-bit MSS big-endian. */
	ptr = skb_put(buff,4);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
	ptr[3] =(newsk->mtu) & 0xff;

	tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
	reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
	skb->sk = newsk;

	/*
	 *	Charge the sock_buff to newsk.
	 */

	sk->rmem_alloc -= skb->truesize;
	newsk->rmem_alloc += skb->truesize;

	/* Queue the SYN on the listener so accept() can find the child. */
	skb_queue_tail(&sk->receive_queue,skb);
	sk->ack_backlog++;
	release_sock(newsk);
	tcp_statistics.TcpOutSegs++;
}
2952
/*
 *	Close a TCP socket.
 *
 *	sk:		socket to close (locked here via sk->inuse, released
 *			with release_sock() on every path)
 *	timeout:	non-zero means the caller wants an immediate hard
 *			close (go straight to TCP_CLOSE); zero means a
 *			normal close — flush, move the state machine and
 *			send a FIN if one is required.
 */

static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 * We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 */

	sk->inuse = 1;

	/* Invalidate the header cache if it points at this socket. */
	if(th_cache_sk==sk)
		tcp_cache_zap();
	if(sk->state == TCP_LISTEN)
	{
		/* Special case: a listener has no peer, just reject all
		   pending connections and go to CLOSE. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;	/* no more sending or receiving */

	if (!sk->dead)
		sk->state_change(sk);	/* wake anyone sleeping on the socket */

	if (timeout == 0)
	{
		struct sk_buff *skb;

		/*
		 *  We need to flush the recv. buffs.  We do this only on the
		 *  descriptor close, not protocol-sourced closes, because the
		 *  reader process may not have drained the data yet!
		 */

		while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 *	Get rid off any half-completed packets.
		 */

		if (sk->partial)
			tcp_send_partial(sk);
	}


	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */

	if(timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* tcp_close_state() returns 1 when a FIN must go out. */
		if(tcp_close_state(sk,1)==1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
3019
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue. This happens as incoming acks
 *	open up the remote window for us.
 *
 *	sk: socket whose write queue is to be drained. Each eligible
 *	skb gets a fresh ack/window stamped in just before transmit.
 */

static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->h.seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it. This
			 *	cannot currently occur.
			 */

			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
			/*
			 * put in the ack seq and window at this point rather than earlier,
			 * in order to keep them monotonic.  We really want to avoid taking
			 * back window allocations.  That's legal, but RFC1122 says it's frowned on.
			 * Ack and window will in general have changed since this packet was put
			 * on the write queue.
			 */
			iph = skb->ip_hdr;
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			size = skb->len - (((unsigned char *) th) - skb->data);

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			/* Checksum must be redone after the header edits above. */
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */

			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */

			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3110
/*
 *	This routine deals with incoming acks, but not outgoing ones.
 *
 *	sk:	the socket the segment belongs to
 *	th:	the TCP header of the received segment
 *	saddr:	peer address (unused here beyond the signature)
 *	len:	total TCP length of the segment (header + data)
 *
 *	Returns 0 when the ack is ahead of anything we sent (caller should
 *	treat the segment as unacceptable), 1 otherwise.
 */

extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
	u32 ack;
	int flag = 0;

	/*
	 * flag bits:
	 * 1 - there was data in packet as well as ack or new data is sent or
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if(sk->zapped)
		return(1);	/* Dead, cant ack any more so why bother */

	/*
	 *	Have we discovered a larger window
	 */

	ack = ntohl(th->ack_seq);

	if (ntohs(th->window) > sk->max_window)
	{
		sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(sk->max_window>>1, sk->mtu);
#else
		sk->mss = min(sk->max_window, sk->mtu);
#endif
	}

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */

	if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
	{
		if(sk->debug)
			printk("Ack ignored %u %u\n",ack,sk->sent_seq);

		/*
		 *	Keepalive processing.
		 */

		if (after(ack, sk->sent_seq))
		{
			return(0);	/* acks data we never sent */
		}

		/*
		 *	Restart the keepalive timer.
		 */

		if (sk->keepopen)
		{
			if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
		}
		return(1);
	}

	/*
	 *	If there is data set flag 1
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 *	See if our window has been shrunk.
	 */

	if (after(sk->window_seq, ack+ntohs(th->window)))
	{
		/*
		 * We may need to move packets from the send queue
		 * to the write queue, if the window has been shrunk on us.
		 * The RFC says you are not allowed to shrink your window
		 * like this, but if the other end does, you must be able
		 * to deal with it.
		 */
		struct sk_buff *skb;
		struct sk_buff *skb2;
		struct sk_buff *wskb = NULL;

		skb2 = sk->send_head;
		sk->send_head = NULL;
		sk->send_tail = NULL;

		/*
		 *	This is an artifact of a flawed concept. We want one
		 *	queue and a smarter send routine when we send all.
		 */

		flag |= 4;	/* Window changed */

		sk->window_seq = ack + ntohs(th->window);
		cli();	/* walk/rebuild the retransmit list atomically */
		while (skb2 != NULL)
		{
			skb = skb2;
			skb2 = skb->link3;
			skb->link3 = NULL;
			if (after(skb->h.seq, sk->window_seq))
			{
				/* Fell outside the new window: push it back
				   onto the write queue, preserving order. */
				if (sk->packets_out > 0)
					sk->packets_out--;
				/* We may need to remove this from the dev send list. */
				if (skb->next != NULL)
				{
					skb_unlink(skb);
				}
				/* Now add it to the write_queue. */
				if (wskb == NULL)
					skb_queue_head(&sk->write_queue,skb);
				else
					skb_append(wskb,skb);
				wskb = skb;
			}
			else
			{
				/* Still in window: keep it on the retransmit list. */
				if (sk->send_head == NULL)
				{
					sk->send_head = skb;
					sk->send_tail = skb;
				}
				else
				{
					sk->send_tail->link3 = skb;
					sk->send_tail = skb;
				}
				skb->link3 = NULL;
			}
		}
		sti();
	}

	/*
	 *	Pipe has emptied
	 */

	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out= 0;
	}

	/*
	 *	Update the right hand window edge of the host
	 */

	sk->window_seq = ack + ntohs(th->window);

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a
		 * counter and increment it once every cwnd times.  It's possible
		 * that this should be done only if sk->retransmits == 0.  I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly.  In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

	/*
	 *	Remember the highest ack received.
	 */

	sk->rcv_ack_seq = ack;

	/*
	 *	If this ack opens up a zero window, clear backoff.  It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open ?
		 */

		if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
		    ! before (sk->window_seq, sk->write_queue.next->h.seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt.  this eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < 20)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = 20;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	while(sk->send_head != NULL)
	{
		/* Check for a bug. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived the other end.
		 */

		if (before(sk->send_head->h.seq, ack+1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 *	We were retransmitting.  don't count this in RTT est
				 */
				flag |= 2;

				/*
				 * even though we've gotten an ack, we're still
				 * retransmitting as long as we're sending from
				 * the retransmit queue.  Keeping retransmits non-zero
				 * prevents us from getting new data interspersed with
				 * retransmissions.
				 */

				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 * Note that we only reset backoff and rto in the
			 * rtt recomputation code.  And that doesn't happen
			 * if there were retransmissions in effect.  So the
			 * first new packet after the retransmissions is
			 * sent with the backoff still in effect.  Not until
			 * we get an ack from a non-retransmitted packet do
			 * we reset the backoff and rto.  This allows us to deal
			 * with a situation where the network delay has increased
			 * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 *	We have one less packet out there.
			 */

			if (sk->packets_out > 0)
				sk->packets_out --;
			/*
			 *	Wake up the process, it can probably write more.
			 */
			if (!sk->dead)
				sk->write_space(sk);
			oskb = sk->send_head;

			if (!(flag&2)) 	/* Not retransmitting */
			{
				long m;

				/*
				 *	The following amusing code comes from Jacobson's
				 *	article in SIGCOMM '88.  Note that rtt and mdev
				 *	are scaled versions of rtt and mean deviation.
				 *	This is designed to be as fast as possible
				 *	m stands for "measurement".
				 */

				m = jiffies - oskb->when;  /* RTT */
				if(m<=0)
					m=1;		/* IS THIS RIGHT FOR <0 ??? */
				m -= (sk->rtt >> 3);    /* m is now error in rtt est */
				sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
				if (m < 0)
					m = -m;		/* m is now abs(error) */
				m -= (sk->mdev >> 2);   /* similar update on mdev */
				sk->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */

				/*
				 *	Now update timeout.  Note that this removes any backoff.
				 */

				sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
				if (sk->rto > 120*HZ)
					sk->rto = 120*HZ;
				if (sk->rto < 20)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
					sk->rto = 20;
				sk->backoff = 0;
			}
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
					   In this case as we just set it up */
			cli();	/* unlink the acked skb atomically */
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 *	We may need to remove this from the dev send list.
			 */

			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;	/* head not yet acked: stop scanning */
		}
	}

	/*
	 * XXX someone ought to look at this too.. at the moment, if skb_peek()
	 * returns non-NULL, we complete ignore the timer stuff in the else
	 * clause.  We ought to organize the code so that else clause can
	 * (should) be executed regardless, possibly moving the PROBE timer
	 * reset over.  The skb_peek() thing should only move stuff to the
	 * write queue, NOT also manage the timer functions.
	 */

	/*
	 * Maybe we can take some stuff off of the write queue,
	 * and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room: start zero-window probing.
			 */
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 * from TCP_CLOSE we don't do anything
		 *
		 * from anything else, if there is write data (or fin) pending,
		 * we use a TIME_WRITE timeout, else if keepalive we reset to
		 * a KEEPALIVE timeout, else we delete the timer.
		 *
		 * We do not set flag for nominal write data, otherwise we may
		 * force a state where we start to write itsy bitsy tidbits
		 * of data.
		 */

		switch(sk->state) {
		case TCP_TIME_WAIT:
			/*
			 * keep us in TIME_WAIT until we stop getting packets,
			 * reset the timeout.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 * don't touch the timer.
			 */
			break;
		default:
			/*
			 * Must check send_head, write_queue, and ack_backlog
			 * to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 *	We have nothing queued but space to send. Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
	 * we are now waiting for an acknowledge to our FIN.  The other end is
	 * already in TIME_WAIT.
	 *
	 * Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if(sk->debug)
			printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			tcp_set_state(sk,TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 * Move to FIN_WAIT2 to await a FIN from the other end. Set
	 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 * Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake
	 */

	if(sk->state==TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest=th->source;
		sk->copied_seq = sk->acked_seq;
		if(!sk->dead)
			sk->state_change(sk);
		if(sk->max_window==0)
		{
			sk->max_window=32;	/* Sanity check */
			sk->mss=min(sk->max_window,sk->mtu);
		}
	}

	/*
	 * I make no guarantees about the first clause in the following
	 * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
	 * what conditions "!flag" would be true.  However I think the rest
	 * of the conditions would prevent that from causing any
	 * unnecessary retransmission.
	 * Clearly if the first packet has expired it should be
	 * retransmitted.  The other alternative, "flag&2 && retransmits", is
	 * harder to explain:  You have to look carefully at how and when the
	 * timer is set and with what timeout.  The most recent transmission always
	 * sets the timer.  So in general if the most recent thing has timed
	 * out, everything before it has as well.  So we want to go ahead and
	 * retransmit some more.  If we didn't explicitly test for this
	 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 * would not be true.  If you look at the pattern of timing, you can
	 * show that rto is increased fast enough that the next packet would
	 * almost never be retransmitted immediately.  Then you'd end up
	 * waiting for a timeout to send each packet on the retransmission
	 * queue.  With my implementation of the Karn sampling algorithm,
	 * the timeout would double each time.  The net result is that it would
	 * take a hideous amount of time to recover from a single dropped packet.
	 * It's possible that there should also be a test for TIME_WRITE, but
	 * I think as long as "send_head != NULL" and "retransmit" is on, we've
	 * got to be in real retransmission mode.
	 * Note that tcp_do_retransmit is called with all==1.  Setting cong_window
	 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 * As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	       (((flag&2) && sk->retransmits) ||
	       (sk->send_head->when + sk->rto < jiffies)))
	{
		if(sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return(1);
}
3701
/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 *
 *	Always returns 0; the return value exists only to match the
 *	callers' expectations.
 */

static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	/*
	 *	Record the sequence number just past the FIN. SYN and FIN
	 *	each occupy one unit of sequence space.
	 */
	sk->fin_seq = th->seq + skb->len + th->syn + th->fin;

	/* Wake anyone sleeping on or polling this socket. */
	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 *	move to CLOSE_WAIT, tcp_data() already handled
			 *	sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			/* FIN+RST together: no more data either way. */
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 *	received a retransmission of the FIN, do
			 *	nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 *	received a retransmission of the FIN,
			 *	restart the TIME_WAIT timer.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 *	This case occurs when a simultaneous close
			 *	happens, we must ack the received FIN and
			 *	enter the CLOSING state.
			 *
			 *	This causes a WRITE timeout, which will either
			 *	move on to TIME_WAIT when we timeout, or resend
			 *	the FIN properly (maybe we get rid of that annoying
			 *	FIN lost hang). The TIME_WRITE code is already correct
			 *	for handling this timeout.
			 */
			if(sk->ip_xmit_timeout != TIME_WRITE)
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 *	received a FIN -- send ACK and enter TIME_WAIT
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 *	already in CLOSE
			 */
			break;
		default:
			/* Remaining states collapse into LAST_ACK. */
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}
3797
3798
3799 /*3800 * This routine handles the data. If there is room in the buffer,3801 * it will be have already been moved into it. If there is no3802 * room, then we will just have to discard the packet.3803 */3804
3805 extern__inline__inttcp_data(structsk_buff *skb, structsock *sk,
/* */3806 unsignedlongsaddr, unsignedshortlen)
3807 {3808 structsk_buff *skb1, *skb2;
3809 structtcphdr *th;
3810 intdup_dumped=0;
3811 u32new_seq, shut_seq;
3812
3813 th = skb->h.th;
3814 skb_pull(skb,th->doff*4);
3815 skb_trim(skb,len-(th->doff*4));
3816
3817 /*3818 * The bytes in the receive read/assembly queue has increased. Needed for the3819 * low memory discard algorithm 3820 */3821
3822 sk->bytes_rcv += skb->len;
3823
3824 if (skb->len == 0 && !th->fin)
3825 {3826 /* 3827 * Don't want to keep passing ack's back and forth. 3828 * (someone sent us dataless, boring frame)3829 */3830 if (!th->ack)
3831 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3832 kfree_skb(skb, FREE_READ);
3833 return(0);
3834 }3835
3836 /*3837 * We no longer have anyone receiving data on this connection.3838 */3839
3840 #ifndef TCP_DONT_RST_SHUTDOWN
3841
3842 if(sk->shutdown & RCV_SHUTDOWN)
3843 {3844 /*3845 * FIXME: BSD has some magic to avoid sending resets to3846 * broken 4.2 BSD keepalives. Much to my surprise a few non3847 * BSD stacks still have broken keepalives so we want to3848 * cope with it.3849 */3850
3851 if(skb->len) /* We don't care if it's just an ack or3852 a keepalive/window probe */3853 {3854 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */3855
3856 /* Do this the way 4.4BSD treats it. Not what I'd3857 regard as the meaning of the spec but it's what BSD3858 does and clearly they know everything 8) */3859
3860 /*3861 * This is valid because of two things3862 *3863 * a) The way tcp_data behaves at the bottom.3864 * b) A fin takes effect when read not when received.3865 */3866
3867 shut_seq=sk->acked_seq+1; /* Last byte */3868
3869 if(after(new_seq,shut_seq))
3870 {3871 if(sk->debug)
3872 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
3873 sk, new_seq, shut_seq, sk->blog);
3874 if(sk->dead)
3875 {3876 sk->acked_seq = new_seq + th->fin;
3877 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3878 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3879 tcp_statistics.TcpEstabResets++;
3880 tcp_set_state(sk,TCP_CLOSE);
3881 sk->err = EPIPE;
3882 sk->shutdown = SHUTDOWN_MASK;
3883 kfree_skb(skb, FREE_READ);
3884 return 0;
3885 }3886 }3887 }3888 }3889
3890 #endif3891
3892 /*3893 * Now we have to walk the chain, and figure out where this one3894 * goes into it. This is set up so that the last packet we received3895 * will be the first one we look at, that way if everything comes3896 * in order, there will be no performance loss, and if they come3897 * out of order we will be able to fit things in nicely.3898 *3899 * [AC: This is wrong. We should assume in order first and then walk3900 * forwards from the first hole based upon real traffic patterns.]3901 * 3902 */3903
3904 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */3905 {3906 skb_queue_head(&sk->receive_queue,skb);
3907 skb1= NULL;
3908 }3909 else3910 {3911 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
3912 {3913 if(sk->debug)
3914 {3915 printk("skb1=%p :", skb1);
3916 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
3917 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
3918 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
3919 sk->acked_seq);
3920 }3921
3922 /*3923 * Optimisation: Duplicate frame or extension of previous frame from3924 * same sequence point (lost ack case).3925 * The frame contains duplicate data or replaces a previous frame3926 * discard the previous frame (safe as sk->inuse is set) and put3927 * the new one in its place.3928 */3929
3930 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3931 {3932 skb_append(skb1,skb);
3933 skb_unlink(skb1);
3934 kfree_skb(skb1,FREE_READ);
3935 dup_dumped=1;
3936 skb1=NULL;
3937 break;
3938 }3939
3940 /*3941 * Found where it fits3942 */3943
3944 if (after(th->seq+1, skb1->h.th->seq))
3945 {3946 skb_append(skb1,skb);
3947 break;
3948 }3949
3950 /*3951 * See if we've hit the start. If so insert.3952 */3953 if (skb1 == skb_peek(&sk->receive_queue))
3954 {3955 skb_queue_head(&sk->receive_queue, skb);
3956 break;
3957 }3958 }3959 }3960
3961 /*3962 * Figure out what the ack value for this frame is3963 */3964
3965 th->ack_seq = th->seq + skb->len;
3966 if (th->syn)
3967 th->ack_seq++;
3968 if (th->fin)
3969 th->ack_seq++;
3970
3971 if (before(sk->acked_seq, sk->copied_seq))
3972 {3973 printk("*** tcp.c:tcp_data bug acked < copied\n");
3974 sk->acked_seq = sk->copied_seq;
3975 }3976
3977 /*3978 * Now figure out if we can ack anything. This is very messy because we really want two3979 * receive queues, a completed and an assembly queue. We also want only one transmit3980 * queue.3981 */3982
3983 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
3984 {3985 if (before(th->seq, sk->acked_seq+1))
3986 {3987 intnewwindow;
3988
3989 if (after(th->ack_seq, sk->acked_seq))
3990 {3991 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3992 if (newwindow < 0)
3993 newwindow = 0;
3994 sk->window = newwindow;
3995 sk->acked_seq = th->ack_seq;
3996 }3997 skb->acked = 1;
3998
3999 /*4000 * When we ack the fin, we do the FIN 4001 * processing.4002 */4003
4004 if (skb->h.th->fin)
4005 {4006 tcp_fin(skb,sk,skb->h.th);
4007 }4008
4009 for(skb2 = skb->next;
4010 skb2 != (structsk_buff *)&sk->receive_queue;
4011 skb2 = skb2->next)
4012 {4013 if (before(skb2->h.th->seq, sk->acked_seq+1))
4014 {4015 if (after(skb2->h.th->ack_seq, sk->acked_seq))
4016 {4017 newwindow = sk->window -
4018 (skb2->h.th->ack_seq - sk->acked_seq);
4019 if (newwindow < 0)
4020 newwindow = 0;
4021 sk->window = newwindow;
4022 sk->acked_seq = skb2->h.th->ack_seq;
4023 }4024 skb2->acked = 1;
4025 /*4026 * When we ack the fin, we do4027 * the fin handling.4028 */4029 if (skb2->h.th->fin)
4030 {4031 tcp_fin(skb,sk,skb->h.th);
4032 }4033
4034 /*4035 * Force an immediate ack.4036 */4037
4038 sk->ack_backlog = sk->max_ack_backlog;
4039 }4040 else4041 {4042 break;
4043 }4044 }4045
4046 /*4047 * This also takes care of updating the window.4048 * This if statement needs to be simplified.4049 */4050 if (!sk->delay_acks ||
4051 sk->ack_backlog >= sk->max_ack_backlog ||
4052 sk->bytes_rcv > sk->max_unacked || th->fin) {4053 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4054 }4055 else4056 {4057 sk->ack_backlog++;
4058 if(sk->debug)
4059 printk("Ack queued.\n");
4060 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4061 }4062 }4063 }4064
4065 /*4066 * If we've missed a packet, send an ack.4067 * Also start a timer to send another.4068 */4069
4070 if (!skb->acked)
4071 {4072
4073 /*4074 * This is important. If we don't have much room left,4075 * we need to throw out a few packets so we have a good4076 * window. Note that mtu is used, not mss, because mss is really4077 * for the send side. He could be sending us stuff as large as mtu.4078 */4079
4080 while (sk->prot->rspace(sk) < sk->mtu)
4081 {4082 skb1 = skb_peek(&sk->receive_queue);
4083 if (skb1 == NULL)
4084 {4085 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4086 break;
4087 }4088
4089 /*4090 * Don't throw out something that has been acked. 4091 */4092
4093 if (skb1->acked)
4094 {4095 break;
4096 }4097
4098 skb_unlink(skb1);
4099 kfree_skb(skb1, FREE_READ);
4100 }4101 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4102 sk->ack_backlog++;
4103 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4104 }4105 else4106 {4107 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4108 }4109
4110 /*4111 * Now tell the user we may have some data. 4112 */4113
4114 if (!sk->dead)
4115 {4116 if(sk->debug)
4117 printk("Data wakeup.\n");
4118 sk->data_ready(sk,0);
4119 }4120 return(0);
4121 }4122
4123
4124 /*4125 * This routine is only called when we have urgent data4126 * signalled. Its the 'slow' part of tcp_urg. It could be4127 * moved inline now as tcp_urg is only called from one4128 * place. We handle URGent data wrong. We have to - as4129 * BSD still doesn't use the correction from RFC961.4130 */4131
4132 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4133 {4134 u32ptr = ntohs(th->urg_ptr);
4135
4136 if (ptr)
4137 ptr--;
4138 ptr += th->seq;
4139
4140 /* ignore urgent data that we've already seen and read */4141 if (after(sk->copied_seq, ptr))
4142 return;
4143
4144 /* do we already have a newer (or duplicate) urgent pointer? */4145 if (sk->urg_data && !after(ptr, sk->urg_seq))
4146 return;
4147
4148 /* tell the world about our new urgent pointer */4149 if (sk->proc != 0) {4150 if (sk->proc > 0) {4151 kill_proc(sk->proc, SIGURG, 1);
4152 }else{4153 kill_pg(-sk->proc, SIGURG, 1);
4154 }4155 }4156 sk->urg_data = URG_NOTYET;
4157 sk->urg_seq = ptr;
4158 }4159
4160 /*4161 * This is the 'fast' part of urgent handling.4162 */4163
4164 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4165 unsignedlongsaddr, unsignedlonglen)
4166 {4167 u32ptr;
4168
4169 /*4170 * Check if we get a new urgent pointer - normally not 4171 */4172
4173 if (th->urg)
4174 tcp_check_urg(sk,th);
4175
4176 /*4177 * Do we wait for any urgent data? - normally not4178 */4179
4180 if (sk->urg_data != URG_NOTYET)
4181 return 0;
4182
4183 /*4184 * Is the urgent pointer pointing into this packet? 4185 */4186
4187 ptr = sk->urg_seq - th->seq + th->doff*4;
4188 if (ptr >= len)
4189 return 0;
4190
4191 /*4192 * Ok, got the correct packet, update info 4193 */4194
4195 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4196 if (!sk->dead)
4197 sk->data_ready(sk,0);
4198 return 0;
4199 }4200
/*
 *	This will accept the next outstanding connection.
 *
 *	Blocks (unless O_NONBLOCK) until an established connection is
 *	queued on the listening socket, then returns its struct sock.
 *	On failure returns NULL with sk->err set (EINVAL, EAGAIN or
 *	ERESTARTSYS).
 */

static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 *	We need to make sure that this socket is listening,
	 *	and that it has something pending.
	 */
	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race: disable interrupts while we take the socket. */
	cli();
	sk->inuse = 1;

	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Release, sleep until woken, then re-acquire. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		/* A pending signal aborts the wait. */
		if (current->signal & ~current->blocked)
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk.
	 */
	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4259
/*
 *	This will initiate an outgoing connection.
 *
 *	Validates the address, picks an initial sequence number, builds
 *	and transmits the SYN (with an MSS option), moves the socket to
 *	SYN_SENT and arms the retransmit timer. Returns 0 or a negative
 *	errno.
 */

static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
	{
		return(-EISCONN);
	}

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */
	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 *	Don't want a TCP connection going to a broadcast address.
	 */
	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	release_sock(sk);

	/* Released above so the GFP_KERNEL allocation may sleep. */
	buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	rt=ip_rt_route(sk->daddr, NULL, NULL);

	/*
	 *	We need to build the routing stuff from the things saved in skb.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	/* SYN consumes one sequence number. */
	t1->seq = ntohl(sk->write_seq++);
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;	/* 24 bytes: header + 4-byte MSS option */

	/* Clamp the window if the route demands it. */
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/* Use 512 or whatever user asked for; else the route MTU;
	   else guess by whether the destination is on our subnet. */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
		sk->mtu = rt->rt_mss;
	else
	{
#ifdef CONFIG_INET_SNARL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - HEADER_SIZE;
		else
			sk->mtu = MAX_WINDOW;
	}

	/*
	 *	but not bigger than device MTU
	 */
	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	Put in the TCP options to say MTU.
	 *	(kind 2 = MSS, length 4, value in network byte order)
	 */
	ptr = skb_put(buff,4);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 *	This must go first otherwise a really quick response will
	 *	get reset.
	 */
	tcp_cache_zap();
	tcp_set_state(sk,TCP_SYN_SENT);
	/* Initial RTO from the route if one was configured. */
	if(rt&&rt->rt_flags&RTF_IRTT)
		sk->rto = rt->rt_irtt;
	else
		sk->rto = TCP_TIMEOUT_INIT;
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	/* Timer for repeating the SYN until an answer */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	/* Now works the right way instead of a hacked initial setting */
	sk->retransmits = 0;

	sk->prot->queue_xmit(sk, dev, buff, 0);
	/* NOTE(review): this second reset_xmit_timer looks redundant with
	   the one just above - confirm before removing. */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4422
/*
 *	This functions checks to see if the tcp header is actually
 *	acceptable: the segment must overlap the receive window
 *	[acked_seq+1, acked_seq+window].
 *
 *	Returns 1 if the segment should be processed, 0 if it should be
 *	dropped. Unacceptable segments are answered with an ACK (resync)
 *	or, in the half-open states, a RST.
 */

extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
	struct options *opt, unsigned long saddr, struct device *dev)
{
	u32 next_seq;

	/* Payload length; a FIN occupies one unit of sequence space. */
	next_seq = len - 4*th->doff;
	if (th->fin)
		next_seq++;
	/* if we have a zero window, we can't have any data in the packet.. */
	if (next_seq && !sk->window)
		goto ignore_it;
	next_seq += th->seq;	/* now the right edge of the segment */

	/*
	 *	This isn't quite right. sk->acked_seq could be more recent
	 *	than sk->window. This is however close enough. We will accept
	 *	slightly more packets than we should, but it should not cause
	 *	problems unless someone is trying to forge packets.
	 */

	/* have we already seen all of this packet? */
	if (!after(next_seq+1, sk->acked_seq))
		goto ignore_it;
	/* or does it start beyond the window? */
	if (!before(th->seq, sk->acked_seq + sk->window + 1))
		goto ignore_it;

	/* ok, at least part of this packet would seem interesting.. */
	return 1;

ignore_it:
	/* Never ACK a RST (see the history at the top of the file). */
	if (th->rst)
		return 0;

	/*
	 *	Send a reset if we get something not ours and we are
	 *	unsynchronized. Note: We don't do anything to our end. We
	 *	are just killing the bogus remote connection then we will
	 *	connect again and it will work (with luck).
	 */
	if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
	{
		tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
		return 1;
	}

	/* Try to resync things. */
	tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
	return 0;
}
/*
 *	When we get a reset we do this: mark the socket zapped, pick the
 *	errno matching the state the reset arrived in, move to CLOSE
 *	(unless RFC1337 protection keeps TIME_WAIT alive), wake the
 *	owner, and drop the frame. Always returns 0.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Map the pre-reset state to the error the user should see. */
	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 *
 *	Main TCP input routine: looks up the socket (with a one-entry
 *	cache), verifies the checksum, queues to the backlog if the
 *	socket is busy, then runs the RFC793/RFC1122 segment processing
 *	(LISTEN/SYN_SENT special cases, then sequence check, RST, SYN,
 *	ACK, URG and data). 'redo' is set when replaying from the
 *	backlog, which skips checksum/byte-swap work already done.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	unsigned long daddr, unsigned short len,
	unsigned long saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	tcp_statistics.TcpInSegs++;
	/* Only frames addressed to this host are interesting. */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket, using the last hit cache if applicable.
	 */
	if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
		sk=(struct sock *)th_cache_sk;
	else
	{
		sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
		th_cache_saddr=saddr;
		th_cache_daddr=daddr;
		th_cache_dport=th->dest;
		th_cache_sport=th->source;
		th_cache_sk=sk;
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */
	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo)
	{
		/*
		 *	Pull up the IP header.
		 */
		skb_pull(skb, skb->h.raw-skb->data);
		/*
		 *	Try to use the device checksum if provided.
		 */
		if (
			(skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
			(!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
			)
		{
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}
		/* Host byte order from here on (only on first pass). */
		th->seq = ntohl(th->seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset
			 *	(checked in tcp_reset)
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			/*
			 *	Discard frame
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here. */
		cli();
		if (sk->inuse)
		{
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Replay from backlog: socket may have died meanwhile. */
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 */
	if (sk->rmem_alloc + skb->truesize >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

	skb->sk=sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	This basically follows the flow suggested by RFC793, with the
	 *	corrections in RFC1122. We don't implement precedence and we
	 *	process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn
	 *	notes in the KA9Q code the RFC793 incoming segment rules don't
	 *	initialise the variables for all paths].
	 */
	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{
		/*
		 *	Now deal with unusual cases.
		 */
		if(sk->state==TCP_LISTEN)
		{
			/* These use the socket TOS.. might want to be the
			   received TOS */
			if(th->ack)
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed
			 *	(old segments). Broadcast/multicast SYN isn't
			 *	allowed. Note - bug if you change the netmask on
			 *	a running connection it can go broadcast. Even
			 *	Sun's have this problem so I'm ignoring it.
			 */
			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */
			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is
			 *	nothing else in the frame. KA9Q has an option to
			 *	send data with the syn, BSD accepts data with the
			 *	syn up to the [to be] advertised window and
			 *	Solaris 2.1 gives you a protocol error. For now
			 *	we just ignore it, that fits the spec precisely
			 *	and avoids incompatibilities. It would be nice in
			 *	future to drop through and process the data.
			 */
			release_sock(sk);
			return 0;
		}

		/* retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and
		 *	either reset for bad matches or go to connected.
		 */
		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/* Reset the ack - its an ack from a
					   different connection [ th->rst is
					   checked in tcp_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers
				 *	and move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq=th->seq+1;
				sk->fin_seq=th->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if(sk->max_window==0)
				{
					sk->max_window = 32;	/* Sanity check */
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking
					   to yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port.
	 *	There is a more complex suggestion for fixing these reuse
	 *	issues in RFC1644 but not yet ready for general use. Also see
	 *	RFC1379.
	 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(th->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			/* Re-look-up: a fresh LISTEN socket may take the SYN. */
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				/* ISN bumped past the old connection's sequence. */
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs.
	 */
	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */
	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}

	/*
	 *	Process the ACK
	 */
	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 *	Our three way handshake failed.
		 */
		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	Process urgent data
	 */
	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	Process the encapsulated data
	 */
	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */
	release_sock(sk);
	return 0;
}
/*
 *	This routine sends a packet with an out of date sequence
 *	number. It assumes the other end will try to ack it.
 *
 *	Two cases are handled:
 *	 1. Queued write data exists and the peer's window has opened
 *	    (sender-side SWS avoidance left data unsent): clone up to
 *	    win_size bytes of the queue-head segment into a fresh buffer
 *	    and transmit that as the probe.
 *	 2. Otherwise: send a bare ACK carrying sequence sent_seq-1,
 *	    which the peer must answer, refreshing our window info.
 */

static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff,*skb;
	struct tcphdr *t1;
	struct device *dev=NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 *	Write data can still be transmitted/retransmitted in the
	 *	following states.  If any other state is encountered, return.
	 *	[listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED &&
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 &&
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	)
	{
		return;
	}

	if ( before(sk->sent_seq, sk->window_seq) &&
	    (skb=skb_peek(&sk->write_queue)))
	{
		/*
		 * We are probing the opening of a window
		 * but the window size is != 0
		 * must have been a result SWS advoidance ( sender )
		 */
		struct iphdr *iph;
		struct tcphdr *th;
		struct tcphdr *nth;
		unsigned long win_size, ow_size;
		void * tcp_data_start;

		/*
		 *	How many bytes can we send ?
		 */

		win_size = sk->window_seq - sk->sent_seq;

		/*
		 *	Recover the buffer pointers from the queued segment.
		 *	The IP header length field (ihl) is in 32-bit words.
		 */

		iph = (struct iphdr *)skb->ip_hdr;
		th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));

		/*
		 *	Grab the data for a temporary frame: room for the
		 *	payload slice, both headers, link headroom and
		 *	alignment slack.  GFP_ATOMIC: may run from a timer.
		 */

		buff = sk->prot->wmalloc(sk, win_size + th->doff * 4 +
				     (iph->ihl << 2) +
				     sk->prot->max_header + 15,
				     1, GFP_ATOMIC);
		if ( buff == NULL )
			return;

		/*
		 *	If we strip the packet on the write queue we must
		 *	be ready to retransmit this one
		 */

		buff->free = /*0*/1;

		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put headers on the new packet
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				     IPPROTO_TCP, sk->opt, buff->truesize,
				     sk->ip_tos,sk->ip_ttl);
		if (tmp < 0)
		{
			sk->prot->wfree(sk, buff);
			return;
		}

		/*
		 *	Move the TCP header over (copied from the queued
		 *	segment, then patched below)
		 */

		buff->dev = dev;

		nth = (struct tcphdr *) skb_put(buff,th->doff*4);

		memcpy(nth, th, th->doff * 4);

		/*
		 *	Correct the new header.
		 *	NOTE(review): ntohl/ntohs are used here where htonl/htons
		 *	are conventional for outbound fields — the swap is its own
		 *	inverse so the result is identical, but confirm intent.
		 */

		nth->ack = 1;
		nth->ack_seq = ntohl(sk->acked_seq);
		nth->window = ntohs(tcp_select_window(sk));
		nth->check = 0;

		/*
		 *	Find the first data byte of the queued segment
		 *	(past link, IP and TCP headers).
		 */

		tcp_data_start = skb->data + skb->dev->hard_header_len +
			(iph->ihl << 2) + th->doff * 4;

		/*
		 *	Add it to our new buffer
		 */

		memcpy(skb_put(buff,win_size), tcp_data_start, win_size);

		/*
		 *	Remember our right edge sequence number.
		 */

		buff->h.seq = sk->sent_seq + win_size;
		sk->sent_seq = buff->h.seq;		/* Hack */
#if 0
		/*
		 *	now: shrink the queue head segment
		 *	(disabled alternative: consume the transmitted bytes
		 *	from the queued skb instead of leaving it intact)
		 */

		th->check = 0;
		ow_size = skb->len - win_size -
			((unsigned long) (tcp_data_start - (void *) skb->data));

		memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
		skb_trim(skb,skb->len-win_size);
		sk->sent_seq += win_size;
		th->seq = htonl(sk->sent_seq);
		if (th->urg)
		{
			unsigned short urg_ptr;

			urg_ptr = ntohs(th->urg_ptr);
			if (urg_ptr <= win_size)
				th->urg = 0;
			else
			{
				urg_ptr -= win_size;
				th->urg_ptr = htons(urg_ptr);
				nth->urg_ptr = htons(win_size);
			}
		}
#else
		/* NOTE(review): this clears URG on the probe when the urgent
		   pointer falls *inside* the transmitted slice — looks inverted
		   relative to the disabled branch above; confirm against RFC 793
		   urgent-pointer semantics before relying on it */
		if(th->urg && ntohs(th->urg_ptr) < win_size)
			nth->urg = 0;
#endif

		/*
		 *	Checksum the split buffer
		 */

		tcp_send_check(nth, sk->saddr, sk->daddr,
			   nth->doff * 4 + win_size , sk);
	}
	else
	{
		/* Nothing sendable queued: probe with a bare ACK instead */
		buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
		if (buff == NULL)
			return;

		buff->free = 1;
		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put in the IP header and routing stuff.
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0)
		{
			sk->prot->wfree(sk, buff);
			return;
		}

		t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
		memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

		/*
		 *	Use a previous sequence.
		 *	This should cause the other end to send an ack.
		 */

		t1->seq = htonl(sk->sent_seq-1);
		t1->ack = 1;
		t1->res1= 0;
		t1->res2= 0;
		t1->rst = 0;
		t1->urg = 0;
		t1->psh = 0;
		t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
		t1->syn = 0;
		t1->ack_seq = ntohl(sk->acked_seq);
		t1->window = ntohs(tcp_select_window(sk));
		t1->doff = sizeof(*t1)/4;
		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	}

	/*
	 *	Send it.
	 */

	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
5134 /*5135 * A window probe timeout has occurred.5136 */5137
5138 voidtcp_send_probe0(structsock *sk)
/* */5139 {5140 if (sk->zapped)
5141 return; /* After a valid reset we can send no more */5142
5143 tcp_write_wakeup(sk);
5144
5145 sk->backoff++;
5146 sk->rto = min(sk->rto << 1, 120*HZ);
5147 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5148 sk->retransmits++;
5149 sk->prot->retransmits ++;
5150 }5151
5152 /*5153 * Socket option code for TCP. 5154 */5155
5156 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5157 {5158 intval,err;
5159
5160 if(level!=SOL_TCP)
5161 returnip_setsockopt(sk,level,optname,optval,optlen);
5162
5163 if (optval == NULL)
5164 return(-EINVAL);
5165
5166 err=verify_area(VERIFY_READ, optval, sizeof(int));
5167 if(err)
5168 returnerr;
5169
5170 val = get_user((int *)optval);
5171
5172 switch(optname)
5173 {5174 caseTCP_MAXSEG:
5175 /*5176 * values greater than interface MTU won't take effect. however at5177 * the point when this call is done we typically don't yet know5178 * which interface is going to be used5179 */5180 if(val<1||val>MAX_WINDOW)
5181 return -EINVAL;
5182 sk->user_mss=val;
5183 return 0;
5184 caseTCP_NODELAY:
5185 sk->nonagle=(val==0)?0:1;
5186 return 0;
5187 default:
5188 return(-ENOPROTOOPT);
5189 }5190 }5191
5192 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5193 {5194 intval,err;
5195
5196 if(level!=SOL_TCP)
5197 returnip_getsockopt(sk,level,optname,optval,optlen);
5198
5199 switch(optname)
5200 {5201 caseTCP_MAXSEG:
5202 val=sk->user_mss;
5203 break;
5204 caseTCP_NODELAY:
5205 val=sk->nonagle;
5206 break;
5207 default:
5208 return(-ENOPROTOOPT);
5209 }5210 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5211 if(err)
5212 returnerr;
5213 put_user(sizeof(int),(int *) optlen);
5214
5215 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5216 if(err)
5217 returnerr;
5218 put_user(val,(int *)optval);
5219
5220 return(0);
5221 }5222
/*
 *	The protocol operations vector plumbing TCP into the generic INET
 *	socket layer.  This uses positional initialization, so the entries
 *	below must stay in the exact order of the fields in struct proto
 *	(declared in the networking headers, outside this file).  The slot
 *	comments name the field each value is presumed to fill — verify
 *	against the struct proto declaration when changing anything here.
 */
struct proto tcp_prot = {
	sock_wmalloc,			/* wmalloc  — generic write-buffer alloc */
	sock_rmalloc,			/* rmalloc  — generic read-buffer alloc */
	sock_wfree,			/* wfree */
	sock_rfree,			/* rfree */
	sock_rspace,			/* rspace   — read buffer space accounting */
	sock_wspace,			/* wspace   — write buffer space accounting */
	tcp_close,
	tcp_read,
	tcp_write,
	tcp_sendto,
	tcp_recvfrom,
	ip_build_header,		/* build_header — IP does the framing */
	tcp_connect,
	tcp_accept,
	ip_queue_xmit,			/* queue_xmit — IP does the sending */
	tcp_retransmit,
	tcp_write_wakeup,
	tcp_read_wakeup,
	tcp_rcv,
	tcp_select,
	tcp_ioctl,
	NULL,				/* init — TCP needs no per-socket init hook */
	tcp_shutdown,
	tcp_setsockopt,
	tcp_getsockopt,
	128,				/* presumed max_header — confirm field name */
	0,				/* presumed retransmits counter, starts at 0 */
	"TCP",				/* protocol name, for /proc and logging */
	0, 0,				/* presumed inuse / highestinuse counters */
	{NULL,}				/* sock_array hash — empty at boot */
};