1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while sk->inuse=1 26 * and was trying to connect (tcp_err()). 27 * Alan Cox : All icmp error handling was broken 28 * pointers passed where wrong and the 29 * socket was looked up backwards. Nobody 30 * tested any icmp error code obviously. 31 * Alan Cox : tcp_err() now handled properly. It wakes people 32 * on errors. select behaves and the icmp error race 33 * has gone by moving it into sock.c 34 * Alan Cox : tcp_reset() fixed to work for everything not just 35 * packets for unknown sockets. 36 * Alan Cox : tcp option processing. 37 * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong] 38 * Herp Rosmanith : More reset fixes 39 * Alan Cox : No longer acks invalid rst frames. Acking 40 * any kind of RST is right out. 41 * Alan Cox : Sets an ignore me flag on an rst receive 42 * otherwise odd bits of prattle escape still 43 * Alan Cox : Fixed another acking RST frame bug. Should stop 44 * LAN workplace lockups. 
45 * Alan Cox : Some tidyups using the new skb list facilities 46 * Alan Cox : sk->keepopen now seems to work 47 * Alan Cox : Pulls options out correctly on accepts 48 * Alan Cox : Fixed assorted sk->rqueue->next errors 49 * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops. 50 * Alan Cox : Tidied tcp_data to avoid a potential nasty. 51 * Alan Cox : Added some better commenting, as the tcp is hard to follow 52 * Alan Cox : Removed incorrect check for 20 * psh 53 * Michael O'Reilly : ack < copied bug fix. 54 * Johannes Stille : Misc tcp fixes (not all in yet). 55 * Alan Cox : FIN with no memory -> CRASH 56 * Alan Cox : Added socket option proto entries. Also added awareness of them to accept. 57 * Alan Cox : Added TCP options (SOL_TCP) 58 * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets. 59 * Alan Cox : Use ip_tos/ip_ttl settings. 60 * Alan Cox : Handle FIN (more) properly (we hope). 61 * Alan Cox : RST frames sent on unsynchronised state ack error/ 62 * Alan Cox : Put in missing check for SYN bit. 63 * Alan Cox : Added tcp_select_window() aka NET2E 64 * window non shrink trick. 65 * Alan Cox : Added a couple of small NET2E timer fixes 66 * Charles Hedrick : TCP fixes 67 * Toomas Tamm : TCP window fixes 68 * Alan Cox : Small URG fix to rlogin ^C ack fight 69 * Charles Hedrick : Rewrote most of it to actually work 70 * Linus : Rewrote tcp_read() and URG handling 71 * completely 72 * Gerhard Koerting: Fixed some missing timer handling 73 * Matthew Dillon : Reworked TCP machine states as per RFC 74 * Gerhard Koerting: PC/TCP workarounds 75 * Adam Caldwell : Assorted timer/timing errors 76 * Matthew Dillon : Fixed another RST bug 77 * Alan Cox : Move to kernel side addressing changes. 78 * Alan Cox : Beginning work on TCP fastpathing (not yet usable) 79 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
80 * Alan Cox : TCP fast path debugging 81 * Alan Cox : Window clamping 82 * Michael Riepe : Bug in tcp_check() 83 * Matt Dillon : More TCP improvements and RST bug fixes 84 * Matt Dillon : Yet more small nasties remove from the TCP code 85 * (Be very nice to this man if tcp finally works 100%) 8) 86 * Alan Cox : BSD accept semantics. 87 * Alan Cox : Reset on closedown bug. 88 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 89 * Michael Pall : Handle select() after URG properly in all cases. 90 * Michael Pall : Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin). 91 * Michael Pall : Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now. 92 * Michael Pall : recv(...,MSG_OOB) never blocks in the BSD api. 93 * Alan Cox : Changed the semantics of sk->socket to 94 * fix a race and a signal problem with 95 * accept() and async I/O. 96 * Alan Cox : Relaxed the rules on tcp_sendto(). 97 * Yury Shevchuk : Really fixed accept() blocking problem. 98 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 99 * clients/servers which listen in on 100 * fixed ports. 101 * Alan Cox : Cleaned the above up and shrank it to 102 * a sensible code size. 103 * Alan Cox : Self connect lockup fix. 104 * Alan Cox : No connect to multicast. 105 * Ross Biro : Close unaccepted children on master 106 * socket close. 107 * Alan Cox : Reset tracing code. 108 * Alan Cox : Spurious resets on shutdown. 109 * Alan Cox : Giant 15 minute/60 second timer error 110 * Alan Cox : Small whoops in selecting before an accept. 111 * Alan Cox : Kept the state trace facility since it's 112 * handy for debugging. 113 * Alan Cox : More reset handler fixes. 114 * Alan Cox : Started rewriting the code based on the RFC's 115 * for other useful protocol references see: 116 * Comer, KA9Q NOS, and for a reference on the 117 * difference between specifications and how BSD 118 * works see the 4.4lite source. 
119 * A.N.Kuznetsov : Don't time wait on completion of tidy 120 * close. 121 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 122 * Linus Torvalds : Fixed BSD port reuse to work first syn 123 * Alan Cox : Reimplemented timers as per the RFC and using multiple 124 * timers for sanity. 125 * Alan Cox : Small bug fixes, and a lot of new 126 * comments. 127 * Alan Cox : Fixed dual reader crash by locking 128 * the buffers (much like datagram.c) 129 * Alan Cox : Fixed stuck sockets in probe. A probe 130 * now gets fed up of retrying without 131 * (even a no space) answer. 132 * Alan Cox : Extracted closing code better 133 * Alan Cox : Fixed the closing state machine to 134 * resemble the RFC. 135 * Alan Cox : More 'per spec' fixes. 136 * Jorge Cwik : Even faster checksumming. 137 * Alan Cox : tcp_data() doesn't ack illegal PSH 138 * only frames. At least one pc tcp stack 139 * generates them. 140 * Alan Cox : Cache last socket. 141 * Alan Cox : Per route irtt. 142 * Matt Day : Select() match BSD precisely on error 143 * Alan Cox : New buffers 144 * Mark Tamsky : Various sk->prot->retransmits and 145 * sk->retransmits misupdating fixed. 146 * Fixed tcp_write_timeout: stuck close, 147 * and TCP syn retries gets used now. 148 * 149 * 150 * To Fix: 151 * Fast path the code. Two things here - fix the window calculation 152 * so it doesn't iterate over the queue, also spot packets with no funny 153 * options arriving in order and process directly. 154 * 155 * Implement RFC 1191 [Path MTU discovery] 156 * Look at the effect of implementing RFC 1337 suggestions and their impact. 157 * Rewrite output state machine to use a single queue and do low window 158 * situations as per the spec (RFC 1122) 159 * Speed up input assembly algorithm. 160 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 161 * could do with it working on IPv4 162 * User settable/learned rtt/max window/mtu 163 * Cope with MTU/device switches when retransmitting in tcp. 
164 * Fix the window handling to use PR's new code. 165 * 166 * Change the fundamental structure to a single send queue maintained 167 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 168 * active routes too]). Cut the queue off in tcp_retransmit/ 169 * tcp_transmit. 170 * Change the receive queue to assemble as it goes. This lets us 171 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 172 * tcp_data/tcp_read as well as the window shrink crud. 173 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 174 * tcp_queue_skb seem obvious routines to extract. 175 * 176 * This program is free software; you can redistribute it and/or 177 * modify it under the terms of the GNU General Public License 178 * as published by the Free Software Foundation; either version 179 * 2 of the License, or(at your option) any later version. 180 * 181 * Description of States: 182 * 183 * TCP_SYN_SENT sent a connection request, waiting for ack 184 * 185 * TCP_SYN_RECV received a connection request, sent ack, 186 * waiting for final ack in three-way handshake. 187 * 188 * TCP_ESTABLISHED connection established 189 * 190 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 191 * transmission of remaining buffered data 192 * 193 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 194 * to shutdown 195 * 196 * TCP_CLOSING both sides have shutdown but we still have 197 * data we have to finish sending 198 * 199 * TCP_TIME_WAIT timeout to catch resent junk before entering 200 * closed, can only be entered from FIN_WAIT2 201 * or CLOSING. Required because the other end 202 * may not have gotten our last ACK causing it 203 * to retransmit the data packet (which we ignore) 204 * 205 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 206 * us to finish writing our data and to shutdown 207 * (we have to close() to move on to LAST_ACK) 208 * 209 * TCP_LAST_ACK out side has shutdown after remote has 210 * shutdown. 
There may still be data in our 211 * buffer that we have to finish sending 212 * 213 * TCP_CLOSE socket is finished 214 */ 215
216 #include <linux/types.h>
217 #include <linux/sched.h>
218 #include <linux/mm.h>
219 #include <linux/time.h>
220 #include <linux/string.h>
221 #include <linux/config.h>
222 #include <linux/socket.h>
223 #include <linux/sockios.h>
224 #include <linux/termios.h>
225 #include <linux/in.h>
226 #include <linux/fcntl.h>
227 #include <linux/inet.h>
228 #include <linux/netdevice.h>
229 #include <net/snmp.h>
230 #include <net/ip.h>
231 #include <net/protocol.h>
232 #include <net/icmp.h>
233 #include <net/tcp.h>
234 #include <net/arp.h>
235 #include <linux/skbuff.h>
236 #include <net/sock.h>
237 #include <net/route.h>
238 #include <linux/errno.h>
239 #include <linux/timer.h>
240 #include <asm/system.h>
241 #include <asm/segment.h>
242 #include <linux/mm.h>
243 #include <net/checksum.h>
244
/*
 *	The MSL timer is the 'normal' timer.
 */

#define reset_msl_timer(x,y,z)	reset_timer(x,y,z)

/* Offset added when generating initial sequence numbers from the clock. */
#define SEQ_TICK 3
unsigned long seq_offset;
/* SNMP counters for TCP (TcpOutSegs, TcpRetransSegs, ...). */
struct tcp_mib	tcp_statistics;

/*
 *	Cached last hit socket: a one-entry demultiplex cache keyed on
 *	the source/destination address and port of the last segment.
 *	Fields are volatile as they are written from interrupt context.
 */

volatile unsigned long th_cache_saddr, th_cache_daddr;
volatile unsigned short th_cache_dport, th_cache_sport;
volatile struct sock *th_cache_sk;
262
263 voidtcp_cache_zap(void)
/* */ 264 { 265 unsignedlongflags;
266 save_flags(flags);
267 cli();
268 th_cache_saddr=0;
269 th_cache_daddr=0;
270 th_cache_dport=0;
271 th_cache_sport=0;
272 th_cache_sk=NULL;
273 restore_flags(flags);
274 } 275
/* Forward declaration: tcp_close is defined later in this file. */
static void tcp_close(struct sock *sk, int timeout);

/*
 *	The less said about this the better, but it works and will do for 1.2
 */

/* Wait queue used to wake select() sleepers when a listening socket
 * gains an established child connection (see tcp_set_state). */
static struct wait_queue *master_select_wakeup;
284
/* Return the smaller of two unsigned values (as an int, matching callers). */
static __inline__ int min(unsigned int a, unsigned int b)
{
	return (a < b) ? a : b;
}
/* Define STATE_TRACE to get printk tracing of TCP state transitions. */
#undef STATE_TRACE

#ifdef STATE_TRACE
/* Human-readable names indexed by the TCP_* state constants. */
static char *statename[]={
	"Unused","Established","Syn Sent","Syn Recv",
	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
	"Close Wait","Last ACK","Listen","Closing"
};
#endif
/*
 *	Move a socket to a new TCP state, keeping the TcpCurrEstab gauge
 *	accurate and waking any select() sleepers on a listening master
 *	socket when a child completes the three-way handshake.
 */
static __inline__ void tcp_set_state(struct sock *sk, int state)
{
	if(sk->state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab--;		/* leaving ESTABLISHED */
#ifdef STATE_TRACE
	if(sk->debug)
		printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
#endif
	/* This is a hack but it doesn't occur often and it's going to
	   be a real pain to fix nicely */

	if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
	{
		/* A passively-opened connection completed: wake select()
		 * waiters on the listening socket. */
		wake_up_interruptible(&master_select_wakeup);
	}
	sk->state=state;
	if(state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab++;		/* entering ESTABLISHED */
}
322 /* 323 * This routine picks a TCP windows for a socket based on 324 * the following constraints 325 * 326 * 1. The window can never be shrunk once it is offered (RFC 793) 327 * 2. We limit memory per socket 328 * 329 * For now we use NET2E3's heuristic of offering half the memory 330 * we have handy. All is not as bad as this seems however because 331 * of two things. Firstly we will bin packets even within the window 332 * in order to get the data we are waiting for into the memory limit. 333 * Secondly we bin common duplicate forms at receive time 334 * Better heuristics welcome 335 */ 336
337 inttcp_select_window(structsock *sk)
/* */ 338 { 339 intnew_window = sk->prot->rspace(sk);
340
341 if(sk->window_clamp)
342 new_window=min(sk->window_clamp,new_window);
343 /* 344 * Two things are going on here. First, we don't ever offer a 345 * window less than min(sk->mss, MAX_WINDOW/2). This is the 346 * receiver side of SWS as specified in RFC1122. 347 * Second, we always give them at least the window they 348 * had before, in order to avoid retracting window. This 349 * is technically allowed, but RFC1122 advises against it and 350 * in practice it causes trouble. 351 * 352 * Fixme: This doesn't correctly handle the case where 353 * new_window > sk->window but not by enough to allow for the 354 * shift in sequence space. 355 */ 356 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
357 return(sk->window);
358 return(new_window);
359 } 360
361 /* 362 * Find someone to 'accept'. Must be called with 363 * sk->inuse=1 or cli() 364 */ 365
366 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 367 { 368 structsk_buff *p=skb_peek(&s->receive_queue);
369 if(p==NULL)
370 returnNULL;
371 do 372 { 373 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
374 returnp;
375 p=p->next;
376 } 377 while(p!=(structsk_buff *)&s->receive_queue);
378 returnNULL;
379 } 380
381 /* 382 * Remove a completed connection and return it. This is used by 383 * tcp_accept() to get connections from the queue. 384 */ 385
386 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 387 { 388 structsk_buff *skb;
389 unsignedlongflags;
390 save_flags(flags);
391 cli();
392 skb=tcp_find_established(s);
393 if(skb!=NULL)
394 skb_unlink(skb); /* Take it off the queue */ 395 restore_flags(flags);
396 returnskb;
397 } 398
399 /* 400 * This routine closes sockets which have been at least partially 401 * opened, but not yet accepted. Currently it is only called by 402 * tcp_close, and timeout mirrors the value there. 403 */ 404
405 staticvoidtcp_close_pending (structsock *sk)
/* */ 406 { 407 structsk_buff *skb;
408
409 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
410 { 411 skb->sk->dead=1;
412 tcp_close(skb->sk, 0);
413 kfree_skb(skb, FREE_READ);
414 } 415 return;
416 } 417
418 /* 419 * Enter the time wait state. 420 */ 421
/*
 *	Enter the time wait state: the connection is shut down in both
 *	directions, sleepers are woken, and the 2*MSL timer is started
 *	to eventually reap the socket.
 */
static void tcp_time_wait(struct sock *sk)
{
	tcp_set_state(sk,TCP_TIME_WAIT);
	sk->shutdown = SHUTDOWN_MASK;		/* no more sends or receives */
	if (!sk->dead)
		sk->state_change(sk);		/* wake anyone sleeping on the socket */
	reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}
431 /* 432 * A socket has timed out on its send queue and wants to do a 433 * little retransmitting. Currently this means TCP. 434 */ 435
/*
 *	A socket has timed out on its send queue and wants to do a little
 *	retransmitting.  Resend frames from the head of the send queue:
 *	just the first frame when 'all' is 0, otherwise walk the chain up
 *	to the congestion window.  Live header fields (ack, window, IP id)
 *	are rewritten in place, so the caller must hold the socket.
 */
void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;
	int ct=0;		/* frames resent during this call */

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;	/* restamp: used by the RTO logic */

		/*
		 * In general it's OK just to use the old packet.  However we
		 * need to use the current ack and window fields.  Urg and
		 * urg_ptr could possibly stand to be updated as well, but we
		 * don't keep the necessary data.  That shouldn't be a problem,
		 * if the other end is doing the right thing.  Since we're
		 * changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)(skb->data + dev->hard_header_len);
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = skb->len - (((unsigned char *) th) - skb->data);

		/*
		 * Note: We ought to check for window limits here but
		 * currently this is done (less efficiently) elsewhere.
		 * We do need to check for a route change but can't handle
		 * that until we have the new 1.3.x buffers in.
		 */

		iph->id = htons(ip_id_count++);
		ip_send_check(iph);

		/*
		 * This is not the right way to handle this. We have to
		 * issue an up to date window and ack report with this
		 * retransmit to keep the odd buggy tcp that relies on
		 * the fact BSD does this happy.
		 * We don't however need to recalculate the entire
		 * checksum, so someone wanting a small problem to play
		 * with might like to implement RFC1141/RFC1624 and speed
		 * this up by avoiding a full checksum.
		 */

		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));
		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		/*
		 * If the interface is (still) up and running, kick it.
		 */

		if (dev->flags & IFF_UP)
		{
			/*
			 * If the packet is still being sent by the device/protocol
			 * below then don't retransmit. This is both needed, and good -
			 * especially with connected mode AX.25 where it stops resends
			 * occurring of an as yet unsent anyway frame!
			 * We still add up the counts as the round trip time wants
			 * adjusting.
			 */
			if (sk && !skb_device_locked(skb))
			{
				/* Remove it from any existing driver queue first! */
				skb_unlink(skb);
				/* Now queue it */
				ip_statistics.IpOutRequests++;
				dev_queue_xmit(skb, dev, sk->priority);
			}
		}

		/*
		 * Count retransmissions
		 */

		ct++;
		sk->prot->retransmits ++;
		tcp_statistics.TcpRetransSegs++;


		/*
		 * Only one retransmit requested.
		 */

		if (!all)
			break;

		/*
		 * This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;	/* next frame on the retransmit chain */
	}
}
545 /* 546 * Reset the retransmission timer 547 */ 548
549 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 550 { 551 del_timer(&sk->retransmit_timer);
552 sk->ip_xmit_timeout = why;
553 if((int)when < 0)
554 { 555 when=3;
556 printk("Error: Negative timer in xmit_timer\n");
557 } 558 sk->retransmit_timer.expires=when;
559 add_timer(&sk->retransmit_timer);
560 } 561
562 /* 563 * This is the normal code called for timeouts. It does the retransmission 564 * and then does backoff. tcp_do_retransmit is separated out because 565 * tcp_ack needs to send stuff from the retransmit queue without 566 * initiating a backoff. 567 */ 568
569
570 voidtcp_retransmit_time(structsock *sk, intall)
/* */ 571 { 572 tcp_do_retransmit(sk, all);
573
574 /* 575 * Increase the timeout each time we retransmit. Note that 576 * we do not increase the rtt estimate. rto is initialized 577 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 578 * that doubling rto each time is the least we can get away with. 579 * In KA9Q, Karn uses this for the first few times, and then 580 * goes to quadratic. netBSD doubles, but only goes up to *64, 581 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 582 * defined in the protocol as the maximum possible RTT. I guess 583 * we'll have to use something other than TCP to talk to the 584 * University of Mars. 585 * 586 * PAWS allows us longer timeouts and large windows, so once 587 * implemented ftp to mars will work nicely. We will have to fix 588 * the 120 second clamps though! 589 */ 590
591 sk->retransmits++;
592 sk->prot->retransmits++;
593 sk->backoff++;
594 sk->rto = min(sk->rto << 1, 120*HZ);
595 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
596 } 597
598
599 /* 600 * A timer event has trigger a tcp retransmit timeout. The 601 * socket xmit queue is ready and set up to send. Because 602 * the ack receive code keeps the queue straight we do 603 * nothing clever here. 604 */ 605
606 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 607 { 608 if (all)
609 { 610 tcp_retransmit_time(sk, all);
611 return;
612 } 613
614 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 615 /* sk->ssthresh in theory can be zero. I guess that's OK */ 616 sk->cong_count = 0;
617
618 sk->cong_window = 1;
619
620 /* Do the actual retransmit. */ 621 tcp_retransmit_time(sk, all);
622 } 623
624 /* 625 * A write timeout has occurred. Process the after effects. 626 */ 627
/*
 *	A write timeout has occurred.  Process the after effects: attempt
 *	ARP recovery on soft timeouts, and give up entirely on sockets
 *	that have retried too long.
 *
 *	Returns 0 if the socket was closed here (release_sock() has
 *	already been called in that case), 1 if the caller still owns it.
 */
static int tcp_write_timeout(struct sock *sk)
{
	/*
	 *	Look for a 'soft' timeout.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
	{
		/*
		 *	Attempt to recover if arp has changed (unlikely!) or
		 *	a route has shifted (not supported prior to 1.3).
		 */
		arp_destroy (sk->daddr, 0);
		/*ip_route_check (sk->daddr);*/
	}

	/*
	 *	Have we tried to SYN too many times (repent repent 8))
	 */

	if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
	{
		sk->err=ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		tcp_statistics.TcpAttemptFails++;	/* Is this right ??? - FIXME - */
		tcp_set_state(sk,TCP_CLOSE);
		/* Don't FIN, we got nothing back */
		release_sock(sk);
		return 0;
	}
	/*
	 *	Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2)
	{
		sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 *	Time wait the socket
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 *	Clean up time.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			release_sock(sk);
			return 0;
		}
	}
	return 1;
}
688 /* 689 * The TCP retransmit timer. This lacks a few small details. 690 * 691 * 1. An initial rtt timeout on the probe0 should cause what we can 692 * of the first write queue buffer to be split and sent. 693 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report 694 * ETIMEDOUT if we know an additional 'soft' error caused this. 695 * tcp_err should save a 'soft error' for us. 696 */ 697
/*
 *	The TCP retransmit timer.  Dispatches on why the timer was armed
 *	(window probe, retransmission, keepalive).  This lacks a few small
 *	details:
 *
 *	1.	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 *
 *	NOTE(review): tcp_write_timeout() calls release_sock() itself when
 *	it closes the socket, yet we call release_sock() again at the end
 *	of this function - verify this double release is benign.
 */
static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;	/* reason the timer was armed */

	/*
	 *	only process if socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				/* Nothing queued: the timeout was for an ack only. */
				restore_flags(flags);
			}
			else
			{
				/*
				 * Kicked by a delayed ack. Reset timer
				 * correctly now
				 */
				if (jiffies < skb->when + sk->rto)
				{
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission
				 */
				sk->prot->retransmit (sk, 0);
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
797 /* 798 * This routine is called by the ICMP module when it gets some 799 * sort of error condition. If err < 0 then the socket should 800 * be closed and the error returned to the user. If err > 0 801 * it's just the icmp type << 8 | icmp code. After adjustment 802 * header points to the first 8 bytes of the tcp header. We need 803 * to find the appropriate port. 804 */ 805
/*
 *	Called by the ICMP module when it gets some sort of error
 *	condition.  If err < 0 the socket should be closed and the error
 *	returned to the user.  If err > 0 it's just the
 *	icmp type << 8 | icmp code.  After adjustment header points to
 *	the first 8 bytes of the tcp header.  We need to find the
 *	appropriate port.
 */
void tcp_err(int err, unsigned char *header, unsigned long daddr,
	unsigned long saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	struct iphdr *iph=(struct iphdr *)header;

	/* Skip the IP header to reach the embedded TCP header. */
	header+=4*iph->ihl;


	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)
		return;		/* no matching socket - nothing to report */

	if(err<0)
	{
		/* Hard error: hand it straight to the owner. */
		sk->err = -err;
		sk->error_report(sk);
		return;
	}

	if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8))
	{
		/*
		 * FIXME:
		 * For now we will just trigger a linear backoff.
		 * The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	/* sk->err = icmp_err_convert[err & 0xff].errno;  -- moved as TCP should hide non fatals internally (and does) */

	/*
	 * If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 */

	if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT)
	{
		if (sk->state == TCP_SYN_SENT)
		{
			/* A connect() in progress is aborted outright. */
			tcp_statistics.TcpAttemptFails++;
			tcp_set_state(sk,TCP_CLOSE);
			sk->error_report(sk);		/* Wake people up to see the error (see connect in sock.c) */
		}
		sk->err = icmp_err_convert[err & 0xff].errno;
	}
	return;
}
861
862 /* 863 * Walk down the receive queue counting readable data until we hit the end or we find a gap 864 * in the received data queue (ie a frame missing that needs sending to us). Not 865 * sorting using two queues as data arrives makes life so much harder. 866 */ 867
/*
 *	Walk down the receive queue counting readable data until we hit
 *	the end or we find a gap in the received data queue (ie a frame
 *	missing that needs sending to us).  Not sorting using two queues
 *	as data arrives makes life so much harder.
 *
 *	Returns the number of bytes a read() could currently consume.
 */
static int tcp_readable(struct sock *sk)
{
	unsigned long counted;		/* sequence position we have counted up to */
	unsigned long amount;		/* readable bytes found so far */
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	/* The queue must not change while we walk it. */
	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 *	Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->h.th->seq)) 	/* Found a hole so stops here */
			break;
		sum = skb->len -(counted - skb->h.th->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;		/* SYN occupies a sequence number but no data byte */
		if (sum > 0)
		{	/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;	/* don't count the SYN's sequence slot as data */
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop.  This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available.  If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
939 /* 940 * LISTEN is a special case for select.. 941 */ 942 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */ 943 { 944 if (sel_type == SEL_IN) { 945 intretval;
946
947 sk->inuse = 1;
948 retval = (tcp_find_established(sk) != NULL);
949 release_sock(sk);
950 if (!retval)
951 select_wait(&master_select_wakeup,wait);
952 returnretval;
953 } 954 return 0;
955 } 956
957
/*
 *	Wait for a TCP event.
 *
 *	Returns 1 when the socket is ready for the requested operation,
 *	otherwise registers on the socket's wait queue and returns 0.
 *
 *	Note that we don't need to set "sk->inuse", as the upper select
 *	layers take care of normal races (between the test and the event)
 *	and we don't go look at any of the socket buffers directly.
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type) {
	case SEL_IN:
		if (sk->err)
			return 1;	/* pending error is 'readable' */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;		/* not connected yet - wait */

		if (sk->shutdown & RCV_SHUTDOWN)
			return 1;	/* EOF is readable */

		if (sk->acked_seq == sk->copied_seq)
			break;		/* nothing new to read */

		/* Data present: readable unless the only byte is
		 * out-of-band data that will not be read inline. */
		if (sk->urg_seq != sk->copied_seq ||
		    sk->acked_seq != sk->copied_seq+1 ||
		    sk->urginline || !sk->urg_data)
			return 1;
		break;

	case SEL_OUT:
		if (sk->err)
			return 1;
		if (sk->shutdown & SEND_SHUTDOWN)
			return 0;	/* writes can never complete */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;
		/*
		 * This is now right thanks to a small fix
		 * by Matt Dillon.
		 */
		if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
			break;		/* not enough room for a worthwhile write */
		return 1;

	case SEL_EX:
		if (sk->urg_data)
			return 1;	/* out-of-band data pending */
		break;
	}
	select_wait(sk->sleep, wait);
	return 0;
}
/*
 *	TCP socket ioctl handler.
 *
 *	TIOCINQ    - bytes currently readable (invalid on a listener)
 *	SIOCATMARK - nonzero when the read pointer is at the urgent mark
 *	TIOCOUTQ   - free space in the send buffer
 *
 *	Each result is written back to the user-supplied int at 'arg'.
 */
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	int err;
	switch(cmd)
	{

		case TIOCINQ:
#ifdef FIXME	/* FIXME: */
		case FIONREAD:
#endif
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);

			sk->inuse = 1;		/* lock out the bottom half while we count */
			amount = tcp_readable(sk);
			release_sock(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		case SIOCATMARK:
		{
			int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;

			err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
			if (err)
				return err;
			put_user(answ,(int *) arg);
			return(0);
		}
		case TIOCOUTQ:
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN) return(-EINVAL);
			amount = sk->prot->wspace(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		default:
			return(-EINVAL);
	}
}
1066
1067 /*1068 * This routine computes a TCP checksum. 1069 *1070 * Modified January 1995 from a go-faster DOS routine by1071 * Jorge Cwik <jorge@laser.satlink.net>1072 */1073
/*
 *	Compute the TCP checksum over the pseudo-header and segment.
 *	'base' is the partial checksum already accumulated over the
 *	TCP header and data (see tcp_send_check); the addresses,
 *	length and protocol are folded in by csum_tcpudp_magic().
 */
unsigned short tcp_check(struct tcphdr *th, int len,
	  unsigned long saddr, unsigned long daddr, unsigned long base)
{
	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
}
1080
1081
1082 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1083 unsignedlongdaddr, intlen, structsock *sk)
1084 {1085 th->check = 0;
1086 th->check = tcp_check(th, len, saddr, daddr,
1087 csum_partial((char *)th,len,0));
1088 return;
1089 }1090
1091 /*1092 * This is the main buffer sending routine. We queue the buffer1093 * having checked it is sane seeming.1094 */1095
/*
 *	This is the main buffer sending routine: sanity check a fully
 *	built TCP frame and either transmit it immediately or append it
 *	to the socket's write queue for later transmission.
 */
static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 *	length of packet (not counting length of pre-tcp headers)
	 */

	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 *	Sanity check it..
	 */

	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 *	If we have queued a header size packet.. (these crash a few
	 *	tcp stacks if ack is not set)
	 */

	if (size == sizeof(struct tcphdr))
	{
		/* If it's got a syn or fin it's notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 *	Actual processing.
	 */

	tcp_statistics.TcpOutSegs++;
	/* Right edge sequence of this frame; SYN/FIN are in 'size' via
	 * the doff subtraction. */
	skb->h.seq = ntohl(th->seq) + size - 4*th->doff;

	/*
	 *	We must queue if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */

	if (after(skb->h.seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit.  So
		 * we shouldn't need to set it at all.  I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 *	If we don't fit we have to start the zero window
		 *	probes. This is broken - we really need to do a partial
		 *	send _first_ (This is what causes the Cisco and PC/TCP
		 *	grief).
		 */

		if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 *	This is going straight out
		 */

		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 *	This is mad. The tcp retransmit queue is put together
		 *	by the ip layer. This causes half the problems with
		 *	unroutable FIN's and other things.
		 */

		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 *	Set for next retransmit based on expected ACK time.
		 *	FIXME: We set this every time which means our
		 *	retransmits are really about a window behind.
		 */

		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
1206 /*1207 * Locking problems lead us to a messy situation where we can have1208 * multiple partially complete buffers queued up. This is really bad1209 * as we don't want to be sending partial buffers. Fix this with1210 * a semaphore or similar to lock tcp_write per socket.1211 *1212 * These routines are pretty self descriptive.1213 */1214
/*
 *	Atomically detach the socket's half-built ("partial") buffer, if
 *	any, and cancel its flush timer. Interrupts are disabled so the
 *	timer handler cannot race with us. Returns the buffer or NULL.
 */
struct sk_buff * tcp_dequeue_partial(struct sock * sk)
{
	struct sk_buff * skb;
	unsigned long flags;

	save_flags(flags);
	cli();
	skb = sk->partial;
	if (skb) {
		sk->partial = NULL;
		del_timer(&sk->partial_timer);
	}
	restore_flags(flags);
	return skb;
}
1231 /*1232 * Empty the partial queue1233 */1234
1235 staticvoidtcp_send_partial(structsock *sk)
/* */1236 {1237 structsk_buff *skb;
1238
1239 if (sk == NULL)
1240 return;
1241 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1242 tcp_send_skb(sk, skb);
1243 }1244
1245 /*1246 * Queue a partial frame1247 */1248
/*
 *	Queue a partial (not yet full-MSS) frame on the socket, replacing
 *	any previously queued one. A timer is armed so the buffer is
 *	flushed even if no more data arrives. If an older partial buffer
 *	was displaced, it is sent immediately (outside the cli section).
 */
void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	tmp = sk->partial;
	if (tmp)
		del_timer(&sk->partial_timer);
	sk->partial = skb;
	init_timer(&sk->partial_timer);
	/*
	 *	Wait up to 1 second for the buffer to fill.
	 *	NOTE(review): 'expires = HZ' is a relative tick count here,
	 *	matching this kernel's timer convention — confirm against
	 *	add_timer() semantics for this tree.
	 */
	sk->partial_timer.expires = HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	if (tmp)
		tcp_send_skb(sk, tmp);
}
1273
1274 /*1275 * This routine sends an ack and also updates the window. 1276 */1277
/*
 *	Send an ACK carrying 'sequence' as its own sequence number and
 *	acknowledging 'ack'. Also updates the advertised window. The
 *	header template 'th' is the received header whose ports are
 *	swapped for the reply.
 */
static void tcp_send_ack(u32 sequence, u32 ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;	/* We have been reset, we may not send again */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */

	buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 *	Force it to send an ack. We don't have to do this
		 *	(ACK is unreliable) but it's much better use of
		 *	bandwidth on slow links to send a spare ack than
		 *	resend packets.
		 */

		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 *	Assemble a suitable TCP frame
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route/header: drop the ACK silently (it is unreliable
		 * anyway). */
		buff->free = 1;
		sk->prot->wfree(sk, buff);
		return;
	}
	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	/* ntohl here really means htonl; they are the same operation. */
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 *	If we have nothing queued for transmit and the transmit timer
	 *	is on we are just doing an ACK timeout and need to switch
	 *	to a keepalive.
	 */

	if (ack == sk->acked_seq)
	{
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
			&& sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 *	Fill in the packet and send it
	 */

	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}
1389
1390 /* 1391 * This routine builds a generic TCP header. 1392 */1393
/*
 *	This routine builds a generic TCP header from the socket's
 *	template (dummy_th), filling in the current sequence/ack numbers
 *	and advertised window. 'push' == 0 sets the PSH bit. As a side
 *	effect the pending-ACK bookkeeping is cleared, since this header
 *	carries the ACK. Returns the header length in bytes.
 */
extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
{

	memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
	th->seq = htonl(sk->write_seq);
	th->psh =(push == 0) ? 1 : 0;
	th->doff = sizeof(*th)/4;
	th->ack = 1;
	th->fin = 0;
	/* This frame acknowledges received data, so drop the backlog. */
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->ack_timed = 0;
	th->ack_seq = htonl(sk->acked_seq);
	sk->window = tcp_select_window(sk);
	th->window = htons(sk->window);

	return(sizeof(*th));
}
1413 /*1414 * This routine copies from a user buffer into a socket,1415 * and starts the transmit system.1416 */1417
/*
 *	This routine copies from a user buffer into a socket, and starts
 *	the transmit system. It blocks (unless 'nonblock') waiting for
 *	the connection to establish and for buffer memory. Returns the
 *	number of bytes queued, or a negative error if nothing was
 *	copied before the error occurred.
 */
static int tcp_write(struct sock *sk, unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;
	int copy;
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1;
	prot = sk->prot;
	while(len > 0)
	{
		if (sk->err)
		{			/* Stop on an error */
			release_sock(sk);
			if (copied)
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

	/*
	 *	First thing we do is make sure that we are established.
	 */

		if (sk->shutdown & SEND_SHUTDOWN)
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied)
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

	/*
	 *	Wait for a connection to finish.
	 */

		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
		{
			if (sk->err)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			/* Not even connecting any more: the pipe is dead. */
			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
			{
				release_sock(sk);
				if (copied)
					return(copied);

				if (sk->err)
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen)
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			release_sock(sk);
			cli();

			/* Recheck the state with interrupts off before we
			 * commit to sleeping, to close the wakeup race. */
			if (sk->state != TCP_ESTABLISHED &&
			    sk->state != TCP_CLOSE_WAIT && sk->err == 0)
			{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

	/*
	 *	The following code can result in copy <= if sk->mss is ever
	 *	decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
	 *	sk->mtu is constant once SYN processing is finished.  I.e. we
	 *	had better not get here until we've seen his SYN and at least one
	 *	valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
	 *	But ESTABLISHED should guarantee that.  sk->max_window is by definition
	 *	non-decreasing.  Note that any ioctl to set user_mss must be done
	 *	before the exchange of SYN's.  If the initial ack from the other
	 *	end has a window of 0, max_window and thus mss will both be 0.
	 */

	/*
	 *	Now we need to check if we have a half built packet.
	 */

		if ((skb = tcp_dequeue_partial(sk)) != NULL)
		{
			int hdrlen;

			/* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				+ sizeof(struct tcphdr);

			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB))
			{
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0)
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}

				memcpy_fromfs(skb_put(skb,copy), from, copy);
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy;
			}
			/* Send now if the frame is full, OOB, or nothing is
			 * in flight; otherwise re-queue it as partial. */
			if ((skb->len - hdrlen) >= sk->mss ||
				(flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);
			continue;
		}

	/*
	 *	We also need to worry about the window.
	 *	If window < 1/2 the maximum window we've seen from this
	 *	host, don't use it.  This is sender side
	 *	silly window prevention, as specified in RFC1122.
	 *	(Note that this is different than earlier versions of
	 *	SWS prevention, e.g. RFC813.).  What we actually do is
	 *	use the whole MSS.  Since the results in the right
	 *	edge of the packet being outside the window, it will
	 *	be queued for later rather than sent.
	 */

		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		if (copy > len)
			copy = len;

	/*
	 *	We should really check the window here also.
	 */

		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB))
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb;
		}
		else
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep.
		 */

		if (skb == NULL)
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition.
			 */

			tmp = sk->wmem_alloc;
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it.
			 */
			if (tmp <= sk->wmem_alloc &&
				(sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0)
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

		/*
		 *	FIXME: we need to optimize this.
		 *	Perhaps some hints here would be good.
		 */

		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 )
		{
			prot->wfree(sk, skb);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}
		skb->dev = dev;
		skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
		tmp = tcp_build_header(skb->h.th, sk, len-copy);
		if (tmp < 0)
		{
			prot->wfree(sk, skb);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB)
		{
			skb->h.th->urg = 1;
			/* ntohs here really means htons; same operation. */
			skb->h.th->urg_ptr = ntohs(copy);
		}

		memcpy_fromfs(skb_put(skb,copy), from, copy);

		from += copy;
		copied += copy;
		len -= copy;
		skb->free = 0;
		sk->write_seq += copy;

		/* A sub-MSS frame while packets are in flight is held back
		 * as a partial buffer (Nagle). */
		if (send_tmp != NULL && sk->packets_out)
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

/*
 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *	interactive fast network servers. It's meant to be on and
 *	it really improves the throughput though not the echo time
 *	on my slow slip link - Alan
 */

/*
 *	Avoid possible race on send_tmp - c/o Johannes Stille
 */

	if(sk->partial && ((!sk->packets_out)
	/* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
1735 /*1736 * This is just a wrapper. 1737 */1738
1739 staticinttcp_sendto(structsock *sk, unsignedchar *from,
/* */1740 intlen, intnonblock, unsignedflags,
1741 structsockaddr_in *addr, intaddr_len)
1742 {1743 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1744 return -EINVAL;
1745 if (sk->state == TCP_CLOSE)
1746 return -ENOTCONN;
1747 if (addr_len < sizeof(*addr))
1748 return -EINVAL;
1749 if (addr->sin_family && addr->sin_family != AF_INET)
1750 return -EINVAL;
1751 if (addr->sin_port != sk->dummy_th.dest)
1752 return -EISCONN;
1753 if (addr->sin_addr.s_addr != sk->daddr)
1754 return -EISCONN;
1755 returntcp_write(sk, from, len, nonblock, flags);
1756 }1757
1758
1759 /*1760 * Send an ack if one is backlogged at this point. Ought to merge1761 * this with tcp_send_ack().1762 */1763
/*
 *	Send an ack if one is backlogged at this point. Ought to merge
 *	this with tcp_send_ack(). Builds a bare ACK from the socket's
 *	header template and transmits it immediately.
 */
static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	/* Nothing owed to the peer: nothing to do. */
	if (!sk->ack_backlog)
		return;

	/*
	 *	FIXME: we need to put code here to prevent this routine from
	 *	being called. Being called once in a while is ok, so only check
	 *	if this is the second time in a row.
	 */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */

	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		buff->free = 1;
		sk->prot->wfree(sk, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	/* Start from the socket's header template, then fill in the
	 * live sequence/ack/window values. */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
1831
1832 /*1833 * FIXME:1834 * This routine frees used buffers.1835 * It should consider sending an ACK to let the1836 * other end know we now have a bigger window.1837 */1838
/*
 *	FIXME:
 *	This routine frees used buffers on the receive queue.
 *	It should consider sending an ACK to let the
 *	other end know we now have a bigger window. If freeing space
 *	changed our advertised window, an ACK is sent now or scheduled
 *	shortly, depending on how much room opened up.
 */
static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	/* Remember the receive space before freeing, to detect change. */
	left = sk->prot->rspace(sk);

	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can.
	 */

	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		/* Stop at the first buffer still unread or still in use
		 * by a sleeping reader (see tcp_read's skb->users). */
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */

	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
					    left);
	if ((rspace=sk->prot->rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
	/*
	 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
	 * if the other end is offering a window smaller than the agreed on MSS
	 * (called sk->mtu here).  In theory there's no connection between send
	 * and receive, and so no reason to think that they're going to send
	 * small packets.  For the moment I'm using the hack of reducing the mss
	 * only on the send side, so I'm putting mtu here.
	 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);
		}
	}
}
1921
1922 /*1923 * Handle reading urgent data. BSD has very simple semantics for1924 * this, no blocking and very strange errors 8)1925 */1926
/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 *
 *	Returns 1 with the urgent byte copied out, 0 at end-of-data,
 *	-EINVAL when there is no OOB byte to read (or it was already
 *	read, or the socket is in inline-urgent mode), or -EAGAIN when
 *	the urgent byte has been signalled but not yet received.
 */
static int tcp_read_urg(struct sock * sk, int nonblock,
	     unsigned char *to, int len, unsigned flags)
{
	/*
	 *	No URG data to read
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
	{
		int tmp = -sk->err;
		sk->err = 0;
		return tmp;
	}

	if (sk->state == TCP_CLOSE || sk->done)
	{
		if (!sk->done) {
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID)
	{
		/* The low byte of urg_data holds the urgent octet. */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;
		put_fs_byte(c, to);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 *	Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 *	the available implementations agree in this case:
	 *	this call should never block, independent of the
	 *	blocking state of the socket.
	 *	Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
1979
1980 /*1981 * This routine copies from a sock struct into the user buffer. 1982 */1983
/*
 *	This routine copies from a sock struct into the user buffer.
 *	It supports MSG_PEEK (via a private copy of the sequence
 *	counter) and MSG_OOB (delegated to tcp_read_urg), blocks unless
 *	'nonblock', and stops at urgent data. Returns bytes copied or a
 *	negative error when nothing was copied first.
 */
static int tcp_read(struct sock *sk, unsigned char *to,
	int len, int nonblock, unsigned flags)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */

	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */

	if (flags & MSG_OOB)
		return tcp_read_urg(sk, nonblock, to, len, flags);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 */

	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;	/* peek: advance a private copy only */

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		u32 offset;

		/*
		 *	Are we at urgent data? Stop if we have read anything.
		 */

		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.
		 */

		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* A hole in the sequence space: must wait. */
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies a sequence slot */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* fully consumed: cleanup_rbuf may free */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		if (copied)
			break;

		if (sk->err)
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Nothing readable: ack what we have, drop the lock and
		 * sleep until data arrives or a signal interrupts us. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */

		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */

		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?
		 */

		if (sk->urg_data)
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					/* Skip the urgent byte unless reading inline. */
					if (!sk->urginline)
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;	/* read up to the mark only */
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */

		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */

		memcpy_tofs(to,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		to += used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */

		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;
		if (used + offset < skb->len)
			continue;

		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		continue;

	found_fin_ok:
		++*seq;		/* FIN occupies a sequence slot */
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */

		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2211 /*2212 * State processing on a close. This implements the state shift for2213 * sending our FIN frame. Note that we only send a FIN for some 2214 * states. A shutdown() may have already sent the FIN, or we may be2215 * closed.2216 */2217
/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	closed. Returns non-zero if the caller must transmit a FIN.
 */
static int tcp_close_state(struct sock *sk, int dead)
{
	int ns=TCP_CLOSE;
	int send_fin=0;
	switch(sk->state)
	{
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:	/* Closedown begin */
			ns=TCP_FIN_WAIT1;
			send_fin=1;
			break;
		case TCP_FIN_WAIT1:	/* Already closing, or FIN sent: no change */
		case TCP_FIN_WAIT2:
		case TCP_CLOSING:
			ns=sk->state;
			break;
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;
			/* no break needed: last case of the switch */
	}

	tcp_set_state(sk,ns);

	/*
	 *	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 */
	if(dead && ns==TCP_FIN_WAIT2)
	{
		/* Keep an already-running timer; otherwise arm the
		 * FIN_WAIT2 reaper. */
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
	}

	return send_fin;
}
2268 /*2269 * Send a fin.2270 */2271
/*
 *	Send a fin. Builds a FIN frame from the socket's header template
 *	and either transmits it at once or, if data is still queued,
 *	appends it to the write queue so it goes out in order.
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost.
		 *	(Not good).
		 */

		buff->free = 1;
		prot->wfree(sk,buff);
		/* Still burn the sequence number the FIN would have used. */
		sk->write_seq++;
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}

	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	/* ntohl here really means htonl; same operation. */
	t1->seq = ntohl(sk->write_seq);
	sk->write_seq++;	/* FIN occupies one sequence number */
	buff->h.seq = sk->write_seq;
	t1->ack = 1;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 *	If there is data in the write queue, the fin must be appended to
	 *	the write queue.
	 */

	if (skb_peek(&sk->write_queue) != NULL)
	{
		buff->free = 0;
		if (buff->next != NULL)
		{
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	}
	else
	{
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2368 /*2369 * Shutdown the sending side of a connection. Much like close except2370 * that we don't receive shut down or set sk->dead=1.2371 */2372
2373 voidtcp_shutdown(structsock *sk, inthow)
/* */2374 {2375 /*2376 * We need to grab some memory, and put together a FIN,2377 * and then put it into the queue to be sent.2378 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2379 */2380
2381 if (!(how & SEND_SHUTDOWN))
2382 return;
2383
2384 /*2385 * If we've already sent a FIN, or it's a closed state2386 */2387
2388 if (sk->state == TCP_FIN_WAIT1 ||
2389 sk->state == TCP_FIN_WAIT2 ||
2390 sk->state == TCP_CLOSING ||
2391 sk->state == TCP_LAST_ACK ||
2392 sk->state == TCP_TIME_WAIT ||
2393 sk->state == TCP_CLOSE ||
2394 sk->state == TCP_LISTEN2395 )
2396 {2397 return;
2398 }2399 sk->inuse = 1;
2400
2401 /*2402 * flag that the sender has shutdown2403 */2404
2405 sk->shutdown |= SEND_SHUTDOWN;
2406
2407 /*2408 * Clear out any half completed packets. 2409 */2410
2411 if (sk->partial)
2412 tcp_send_partial(sk);
2413
2414 /*2415 * FIN if needed2416 */2417
2418 if(tcp_close_state(sk,0))
2419 tcp_send_fin(sk);
2420
2421 release_sock(sk);
2422 }2423
2424
2425 staticint2426 tcp_recvfrom(structsock *sk, unsignedchar *to,
/* */2427 intto_len, intnonblock, unsignedflags,
2428 structsockaddr_in *addr, int *addr_len)
2429 {2430 intresult;
2431
2432 /* 2433 * Have to check these first unlike the old code. If 2434 * we check them after we lose data on an error2435 * which is wrong 2436 */2437
2438 if(addr_len)
2439 *addr_len = sizeof(*addr);
2440 result=tcp_read(sk, to, to_len, nonblock, flags);
2441
2442 if (result < 0)
2443 return(result);
2444
2445 if(addr)
2446 {2447 addr->sin_family = AF_INET;
2448 addr->sin_port = sk->dummy_th.dest;
2449 addr->sin_addr.s_addr = sk->daddr;
2450 }2451 return(result);
2452 }2453
2454
2455 /*2456 * This routine will send an RST to the other tcp. 2457 */2458
2459 staticvoidtcp_reset(unsignedlongsaddr, unsignedlongdaddr, structtcphdr *th,
/* */2460 structproto *prot, structoptions *opt, structdevice *dev, inttos, intttl)
2461 {2462 structsk_buff *buff;
2463 structtcphdr *t1;
2464 inttmp;
2465 structdevice *ndev=NULL;
2466
2467 /*2468 * Cannot reset a reset (Think about it).2469 */2470
2471 if(th->rst)
2472 return;
2473
2474 /*2475 * We need to grab some memory, and put together an RST,2476 * and then put it into the queue to be sent.2477 */2478
2479 buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2480 if (buff == NULL)
2481 return;
2482
2483 buff->sk = NULL;
2484 buff->dev = dev;
2485 buff->localroute = 0;
2486
2487 /*2488 * Put in the IP header and routing stuff. 2489 */2490
2491 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2492 sizeof(structtcphdr),tos,ttl);
2493 if (tmp < 0)
2494 {2495 buff->free = 1;
2496 prot->wfree(NULL, buff);
2497 return;
2498 }2499
2500 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
2501 memcpy(t1, th, sizeof(*t1));
2502
2503 /*2504 * Swap the send and the receive. 2505 */2506
2507 t1->dest = th->source;
2508 t1->source = th->dest;
2509 t1->rst = 1;
2510 t1->window = 0;
2511
2512 if(th->ack)
2513 {2514 t1->ack = 0;
2515 t1->seq = th->ack_seq;
2516 t1->ack_seq = 0;
2517 }2518 else2519 {2520 t1->ack = 1;
2521 if(!th->syn)
2522 t1->ack_seq=htonl(th->seq);
2523 else2524 t1->ack_seq=htonl(th->seq+1);
2525 t1->seq=0;
2526 }2527
2528 t1->syn = 0;
2529 t1->urg = 0;
2530 t1->fin = 0;
2531 t1->psh = 0;
2532 t1->doff = sizeof(*t1)/4;
2533 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2534 prot->queue_xmit(NULL, ndev, buff, 1);
2535 tcp_statistics.TcpOutSegs++;
2536 }2537
2538
2539 /*2540 * Look for tcp options. Parses everything but only knows about MSS.2541 * This routine is always called with the packet containing the SYN.2542 * However it may also be called with the ack to the SYN. So you2543 * can't assume this is always the SYN. It's always called after2544 * we have set up sk->mtu to our own MTU.2545 *2546 * We need at minimum to add PAWS support here. Possibly large windows2547 * as Linux gets deployed on 100Mb/sec networks.2548 */2549
2550 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2551 {2552 unsignedchar *ptr;
2553 intlength=(th->doff*4)-sizeof(structtcphdr);
2554 intmss_seen = 0;
2555
2556 ptr = (unsignedchar *)(th + 1);
2557
2558 while(length>0)
2559 {2560 intopcode=*ptr++;
2561 intopsize=*ptr++;
2562 switch(opcode)
2563 {2564 caseTCPOPT_EOL:
2565 return;
2566 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */2567 length--;
2568 ptr--; /* the opsize=*ptr++ above was a mistake */2569 continue;
2570
2571 default:
2572 if(opsize<=2) /* Avoid silly options looping forever */2573 return;
2574 switch(opcode)
2575 {2576 caseTCPOPT_MSS:
2577 if(opsize==4 && th->syn)
2578 {2579 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2580 mss_seen = 1;
2581 }2582 break;
2583 /* Add other options here as people feel the urge to implement stuff like large windows */2584 }2585 ptr+=opsize-2;
2586 length-=opsize;
2587 }2588 }2589 if (th->syn)
2590 {2591 if (! mss_seen)
2592 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2593 }2594 #ifdefCONFIG_INET_PCTCP2595 sk->mss = min(sk->max_window >> 1, sk->mtu);
2596 #else2597 sk->mss = min(sk->max_window, sk->mtu);
2598 #endif2599 }2600
/*
 *	Return the classful netmask (network byte order) for an IPv4
 *	address given in network byte order.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);

	if (IN_CLASSA(host))
		return htonl(IN_CLASSA_NET);
	return IN_CLASSB(host) ? htonl(IN_CLASSB_NET) : htonl(IN_CLASSC_NET);
}
2611 /*2612 * Default sequence number picking algorithm.2613 * As close as possible to RFC 793, which2614 * suggests using a 250kHz clock.2615 * Further reading shows this assumes 2MB/s networks.2616 * For 10MB/s ethernet, a 1MHz clock is appropriate.2617 * That's funny, Linux has one built in! Use it!2618 */2619
2620 externinlineu32tcp_init_seq(void)
/* */2621 {2622 structtimevaltv;
2623 do_gettimeofday(&tv);
2624 returntv.tv_usec+tv.tv_sec*1000000;
2625 }2626
/*
 *	This routine handles a connection request.
 *	It should make sure we haven't already responded.
 *	Because of the way BSD works, we have to send a syn/ack now.
 *	This also means it will be harder to close a socket which is
 *	listening.
 *
 *	sk is the listening socket; skb carries the received SYN; daddr/saddr
 *	are our/their addresses from the IP layer; seq is the initial send
 *	sequence chosen by the caller.  A new sock is cloned from the
 *	listener, placed in SYN_RECV, and a SYN/ACK (with an MSS option) is
 *	transmitted.  The SYN skb ends up queued on the LISTENING socket's
 *	receive queue so accept() can find the embryonic connection.
 */

static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 unsigned long daddr, unsigned long saddr,
		 struct options *opt, struct device *dev, u32 seq)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	unsigned char *ptr;
	struct sock *newsk;
	struct tcphdr *th;
	struct device *ndev = NULL;
	int tmp;
	struct rtable *rt;

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection: reset it. */
	if (!sk->dead)
	{
		sk->data_ready(sk, 0);
	}
	else
	{
		if (sk->debug)
			printk("Reset on %p: Connect on dead socket.\n", sk);
		tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	Make sure we can accept more.  This will prevent a
	 *	flurry of syns from eating up all our memory.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	We need to build a new sock struct.
	 *	It is sort of bad to have a socket without an inode attached
	 *	to it, but the wake_up's will just wake up the listening socket,
	 *	and if the listening socket is destroyed before this is taken
	 *	off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* just ignore the syn.  It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/* Clone the listener, then reset all per-connection state. */
	memcpy(newsk, sk, sizeof(*newsk));
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;		/* slow start: one segment */
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	/* The peer's SYN consumes one sequence number. */
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->fin_seq = skb->h.th->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger = 0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function = &retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/*
	 *	Swap these two, they are from our point of view.
	 */

	newsk->daddr = saddr;
	newsk->saddr = daddr;

	put_sock(newsk->num, newsk);
	newsk->dummy_th.res1 = 0;
	newsk->dummy_th.doff = 6;	/* NOTE(review): 24 bytes/4 — presumably sized for the MSS option; confirm */
	newsk->dummy_th.fin = 0;
	newsk->dummy_th.syn = 0;
	newsk->dummy_th.rst = 0;
	newsk->dummy_th.psh = 0;
	newsk->dummy_th.ack = 0;
	newsk->dummy_th.urg = 0;
	newsk->dummy_th.res2 = 0;
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->socket = NULL;

	/*
	 *	Grab the ttl and tos values and use them
	 */

	newsk->ip_ttl = sk->ip_ttl;
	newsk->ip_tos = skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for
	 */

	/*
	 *	Note use of sk->user_mss, since user has no direct access to newsk
	 */

	rt = ip_rt_route(saddr, NULL, NULL);

	if (rt != NULL && (rt->rt_flags & RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if (rt != NULL && (rt->rt_flags & RTF_MSS))
		newsk->mtu = rt->rt_mss - HEADER_SIZE;
	else
	{
#ifdef CONFIG_INET_SNARL	/* Sub Nets Are Local */
		if ((saddr ^ daddr) & default_mask(saddr))
#else
		if ((saddr ^ daddr) & dev->pa_mask)
#endif
			/* Off-net destination: be conservative (RFC 879). */
			newsk->mtu = 576 - HEADER_SIZE;
		else
			newsk->mtu = MAX_WINDOW;
	}

	/*
	 *	But not bigger than device MTU
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	This will min with what arrived in the packet
	 */

	tcp_options(newsk, skb->h.th);

	tcp_cache_zap();

	buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* NOTE(review): this sets err on the LISTENING socket — looks intentional
		   since newsk is about to be destroyed, but verify. */
		sk->err = ENOMEM;
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		/* And this will destroy it */
		release_sock(newsk);
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->sk = newsk;
	buff->localroute = newsk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
			       IPPROTO_TCP, NULL, MAX_SYN_SIZE, sk->ip_tos, sk->ip_ttl);

	/*
	 *	Something went wrong.
	 */

	if (tmp < 0)
	{
		sk->err = tmp;
		buff->free = 1;
		kfree_skb(buff, FREE_WRITE);
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		release_sock(newsk);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	t1 = (struct tcphdr *)skb_put(buff, sizeof(struct tcphdr));

	memcpy(t1, skb->h.th, sizeof(*t1));
	buff->h.seq = newsk->write_seq;
	/*
	 *	Swap the send and the receive.
	 */
	t1->dest = skb->h.th->source;
	t1->source = newsk->dummy_th.source;
	t1->seq = ntohl(newsk->write_seq++);	/* our SYN consumes a sequence number */
	t1->ack = 1;
	newsk->window = tcp_select_window(newsk);
	newsk->sent_seq = newsk->write_seq;
	t1->window = ntohs(newsk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->ack_seq = ntohl(skb->h.th->seq + 1);
	t1->doff = sizeof(*t1) / 4 + 1;		/* fixed header + 4 bytes of MSS option */
	/* Append the MSS option: kind 2, length 4, 16-bit MSS big-endian. */
	ptr = skb_put(buff, 4);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
	ptr[3] = (newsk->mtu) & 0xff;

	tcp_send_check(t1, daddr, saddr, sizeof(*t1) + 4, newsk);
	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
	reset_xmit_timer(newsk, TIME_WRITE, TCP_TIMEOUT_INIT);
	skb->sk = newsk;

	/*
	 *	Charge the sock_buff to newsk.
	 */

	sk->rmem_alloc -= skb->truesize;
	newsk->rmem_alloc += skb->truesize;

	/* Queue the SYN on the listener so accept() can reap newsk. */
	skb_queue_tail(&sk->receive_queue, skb);
	sk->ack_backlog++;
	release_sock(newsk);
	tcp_statistics.TcpOutSegs++;
}
2905
/*
 *	Close a TCP socket.  timeout != 0 means hard close (go straight to
 *	CLOSE); timeout == 0 is the normal descriptor close, which flushes
 *	the receive queue and runs the FIN state machine.
 */
static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 *	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 */

	sk->inuse = 1;

	/* Invalidate the header-prediction cache if it points at us. */
	if (th_cache_sk == sk)
		tcp_cache_zap();
	if (sk->state == TCP_LISTEN)
	{
		/* Special case: never synchronised — drop pending embryos and go. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;

	if (!sk->dead)
		sk->state_change(sk);

	if (timeout == 0)
	{
		struct sk_buff *skb;

		/*
		 *	We need to flush the recv. buffs.  We do this only on the
		 *	descriptor close, not protocol-sourced closes, because the
		 *	reader process may not have drained the data yet!
		 */

		while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 *	Get rid off any half-completed packets.
		 */

		if (sk->partial)
			tcp_send_partial(sk);
	}

	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */

	if (timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* tcp_close_state()==1 means a FIN must actually be sent. */
		if (tcp_close_state(sk, 1) == 1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
2972
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue.  This happens as incoming acks
 *	open up the remote window for us.
 */

static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here.  In time closedown will
	 *	empty the write queue and all will be happy.
	 */

	if (sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while ((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->h.seq, sk->rcv_ack_seq + 1))
		{
			/*
			 *	This is acked data.  We can discard it.  This
			 *	cannot currently occur.
			 */

			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
			/*
			 *	put in the ack seq and window at this point rather than earlier,
			 *	in order to keep them monotonic.  We really want to avoid taking
			 *	back window allocations.  That's legal, but RFC1122 says it's frowned on.
			 *	Ack and window will in general have changed since this packet was put
			 *	on the write queue.
			 */
			iph = (struct iphdr *)(skb->data +
					       skb->dev->hard_header_len);
			th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
			size = skb->len - (((unsigned char *) th) - skb->data);

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			/* Checksum must be recomputed after patching ack/window. */
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */

			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */

			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3064
/*
 *	This routine deals with incoming acks, but not outgoing ones.
 *
 *	Returns 0 when the ack is ahead of anything we sent (ignore and
 *	let the caller decide), 1 otherwise.  `flag' bits accumulated:
 *	1 - there was data in packet as well as ack or new data is sent or
 *	    in shutdown state
 *	2 - data from retransmit queue was acked and removed
 *	4 - window shrunk or data from retransmit queue was acked and removed
 */

extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
	u32 ack;
	int flag = 0;

	if (sk->zapped)
		return(1);	/* Dead, cant ack any more so why bother */

	/*
	 *	Have we discovered a larger window
	 */

	ack = ntohl(th->ack_seq);

	if (ntohs(th->window) > sk->max_window)
	{
		sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(sk->max_window >> 1, sk->mtu);
#else
		sk->mss = min(sk->max_window, sk->mtu);
#endif
	}

	/*
	 *	We have dropped back to keepalive timeouts.  Thus we have
	 *	no retransmits pending.
	 */

	if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
	{
		if (sk->debug)
			printk("Ack ignored %u %u\n", ack, sk->sent_seq);

		/*
		 *	Keepalive processing.
		 */

		if (after(ack, sk->sent_seq))
		{
			return(0);
		}

		/*
		 *	Restart the keepalive timer.
		 */

		if (sk->keepopen)
		{
			if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
		}
		return(1);
	}

	/*
	 *	If there is data set flag 1
	 */

	if (len != th->doff * 4)
		flag |= 1;

	/*
	 *	See if our window has been shrunk.
	 */

	if (after(sk->window_seq, ack + ntohs(th->window)))
	{
		/*
		 *	We may need to move packets from the send queue
		 *	to the write queue, if the window has been shrunk on us.
		 *	The RFC says you are not allowed to shrink your window
		 *	like this, but if the other end does, you must be able
		 *	to deal with it.
		 */
		struct sk_buff *skb;
		struct sk_buff *skb2;
		struct sk_buff *wskb = NULL;

		skb2 = sk->send_head;
		sk->send_head = NULL;
		sk->send_tail = NULL;

		/*
		 *	This is an artifact of a flawed concept.  We want one
		 *	queue and a smarter send routine when we send all.
		 */

		flag |= 4;	/* Window changed */

		sk->window_seq = ack + ntohs(th->window);
		cli();		/* walk the retransmit list atomically */
		while (skb2 != NULL)
		{
			skb = skb2;
			skb2 = skb->link3;
			skb->link3 = NULL;
			if (after(skb->h.seq, sk->window_seq))
			{
				/* Now outside the window: push back to write_queue, in order. */
				if (sk->packets_out > 0)
					sk->packets_out--;
				/* We may need to remove this from the dev send list. */
				if (skb->next != NULL)
				{
					skb_unlink(skb);
				}
				/* Now add it to the write_queue. */
				if (wskb == NULL)
					skb_queue_head(&sk->write_queue, skb);
				else
					skb_append(wskb, skb);
				wskb = skb;
			}
			else
			{
				/* Still in the window: rebuild the retransmit list. */
				if (sk->send_head == NULL)
				{
					sk->send_head = skb;
					sk->send_tail = skb;
				}
				else
				{
					sk->send_tail->link3 = skb;
					sk->send_tail = skb;
				}
				skb->link3 = NULL;
			}
		}
		sti();
	}

	/*
	 *	Pipe has emptied
	 */

	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out = 0;
	}

	/*
	 *	Update the right hand window edge of the host
	 */

	sk->window_seq = ack + ntohs(th->window);

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a
		 * counter and increment it once every cwnd times.  It's possible
		 * that this should be done only if sk->retransmits == 0.  I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly.  In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

	/*
	 *	Remember the highest ack received.
	 */

	sk->rcv_ack_seq = ack;

	/*
	 *	If this ack opens up a zero window, clear backoff.  It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open ?
		 */

		if (skb_peek(&sk->write_queue) != NULL &&	/* should always be non-null */
		    ! before(sk->window_seq, sk->write_queue.next->h.seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt.  this eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < 20)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = 20;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	while (sk->send_head != NULL)
	{
		/* Check for a bug. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived the other end.
		 */

		if (before(sk->send_head->h.seq, ack + 1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 *	We were retransmitting.  don't count this in RTT est
				 */
				flag |= 2;

				/*
				 *	even though we've gotten an ack, we're still
				 *	retransmitting as long as we're sending from
				 *	the retransmit queue.  Keeping retransmits non-zero
				 *	prevents us from getting new data interspersed with
				 *	retransmissions.
				 */

				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 *	Note that we only reset backoff and rto in the
			 *	rtt recomputation code.  And that doesn't happen
			 *	if there were retransmissions in effect.  So the
			 *	first new packet after the retransmissions is
			 *	sent with the backoff still in effect.  Not until
			 *	we get an ack from a non-retransmitted packet do
			 *	we reset the backoff and rto.  This allows us to deal
			 *	with a situation where the network delay has increased
			 *	suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 *	We have one less packet out there.
			 */

			if (sk->packets_out > 0)
				sk->packets_out--;
			/*
			 *	Wake up the process, it can probably write more.
			 */
			if (!sk->dead)
				sk->write_space(sk);
			oskb = sk->send_head;

			if (!(flag & 2))	/* Not retransmitting */
			{
				long m;

				/*
				 *	The following amusing code comes from Jacobson's
				 *	article in SIGCOMM '88.  Note that rtt and mdev
				 *	are scaled versions of rtt and mean deviation.
				 *	This is designed to be as fast as possible
				 *	m stands for "measurement".
				 */

				m = jiffies - oskb->when;  /* RTT */
				if (m <= 0)
					m = 1;		/* IS THIS RIGHT FOR <0 ??? */
				m -= (sk->rtt >> 3);	/* m is now error in rtt est */
				sk->rtt += m;		/* rtt = 7/8 rtt + 1/8 new */
				if (m < 0)
					m = -m;		/* m is now abs(error) */
				m -= (sk->mdev >> 2);	/* similar update on mdev */
				sk->mdev += m;		/* mdev = 3/4 mdev + 1/4 new */

				/*
				 *	Now update timeout.  Note that this removes any backoff.
				 */

				sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
				if (sk->rto > 120*HZ)
					sk->rto = 120*HZ;
				if (sk->rto < 20)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
					sk->rto = 20;
				sk->backoff = 0;
			}
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
					   In this case as we just set it up */
			cli();
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 *	We may need to remove this from the dev send list.
			 */

			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 *	XXX someone ought to look at this too.. at the moment, if skb_peek()
	 *	returns non-NULL, we complete ignore the timer stuff in the else
	 *	clause.  We ought to organize the code so that else clause can
	 *	(should) be executed regardless, possibly moving the PROBE timer
	 *	reset over.  The skb_peek() thing should only move stuff to the
	 *	write queue, NOT also manage the timer functions.
	 */

	/*
	 *	Maybe we can take some stuff off of the write queue,
	 *	and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after(sk->window_seq + 1, sk->write_queue.next->h.seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room.
			 */
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 *	from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 *	from TCP_CLOSE we don't do anything
		 *
		 *	from anything else, if there is write data (or fin) pending,
		 *	we use a TIME_WRITE timeout, else if keepalive we reset to
		 *	a KEEPALIVE timeout, else we delete the timer.
		 *
		 *	We do not set flag for nominal write data, otherwise we may
		 *	force a state where we start to write itsy bitsy tidbits
		 *	of data.
		 */

		switch (sk->state) {
		case TCP_TIME_WAIT:
			/*
			 *	keep us in TIME_WAIT until we stop getting packets,
			 *	reset the timeout.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 *	don't touch the timer.
			 */
			break;
		default:
			/*
			 *	Must check send_head, write_queue, and ack_backlog
			 *	to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 *	We have nothing queued but space to send.  Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 *	In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
	 *	we are now waiting for an acknowledge to our FIN.  The other end is
	 *	already in TIME_WAIT.
	 *
	 *	Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->debug)
			printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
				sk->rcv_ack_seq, sk->write_seq, sk->acked_seq, sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 *	Move to FIN_WAIT2 to await a FIN from the other end.  Set
	 *	SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 *	Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake
	 */

	if (sk->state == TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk, th);
		sk->dummy_th.dest = th->source;
		sk->copied_seq = sk->acked_seq;
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->max_window == 0)
		{
			sk->max_window = 32;	/* Sanity check */
			sk->mss = min(sk->max_window, sk->mtu);
		}
	}

	/*
	 *	I make no guarantees about the first clause in the following
	 *	test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
	 *	what conditions "!flag" would be true.  However I think the rest
	 *	of the conditions would prevent that from causing any
	 *	unnecessary retransmission.
	 *	Clearly if the first packet has expired it should be
	 *	retransmitted.  The other alternative, "flag&2 && retransmits", is
	 *	harder to explain:  You have to look carefully at how and when the
	 *	timer is set and with what timeout.  The most recent transmission always
	 *	sets the timer.  So in general if the most recent thing has timed
	 *	out, everything before it has as well.  So we want to go ahead and
	 *	retransmit some more.  If we didn't explicitly test for this
	 *	condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 *	would not be true.  If you look at the pattern of timing, you can
	 *	show that rto is increased fast enough that the next packet would
	 *	almost never be retransmitted immediately.  Then you'd end up
	 *	waiting for a timeout to send each packet on the retransmission
	 *	queue.  With my implementation of the Karn sampling algorithm,
	 *	the timeout would double each time.  The net result is that it would
	 *	take a hideous amount of time to recover from a single dropped packet.
	 *	It's possible that there should also be a test for TIME_WRITE, but
	 *	I think as long as "send_head != NULL" and "retransmit" is on, we've
	 *	got to be in real retransmission mode.
	 *	Note that tcp_do_retransmit is called with all==1.  Setting cong_window
	 *	back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 *	As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag & 4)) && sk->send_head != NULL &&
	    (((flag & 2) && sk->retransmits) ||
	     (sk->send_head->when + sk->rto < jiffies)))
	{
		if (sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk, 0);
		else
		{
			tcp_do_retransmit(sk, 1);
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return(1);
}
3655
3656 /*3657 * Process the FIN bit. This now behaves as it is supposed to work3658 * and the FIN takes effect when it is validly part of sequence3659 * space. Not before when we get holes.3660 *3661 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT3662 * (and thence onto LAST-ACK and finally, CLOSE, we never enter3663 * TIME-WAIT)3664 *3665 * If we are in FINWAIT-1, a received FIN indicates simultaneous3666 * close and we go into CLOSING (and later onto TIME-WAIT)3667 *3668 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.3669 *3670 */3671
3672 staticinttcp_fin(structsk_buff *skb, structsock *sk, structtcphdr *th)
/* */3673 {3674 sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3675
3676 if (!sk->dead)
3677 {3678 sk->state_change(sk);
3679 sock_wake_async(sk->socket, 1);
3680 }3681
3682 switch(sk->state)
3683 {3684 caseTCP_SYN_RECV:
3685 caseTCP_SYN_SENT:
3686 caseTCP_ESTABLISHED:
3687 /*3688 * move to CLOSE_WAIT, tcp_data() already handled3689 * sending the ack.3690 */3691 tcp_set_state(sk,TCP_CLOSE_WAIT);
3692 if (th->rst)
3693 sk->shutdown = SHUTDOWN_MASK;
3694 break;
3695
3696 caseTCP_CLOSE_WAIT:
3697 caseTCP_CLOSING:
3698 /*3699 * received a retransmission of the FIN, do3700 * nothing.3701 */3702 break;
3703 caseTCP_TIME_WAIT:
3704 /*3705 * received a retransmission of the FIN,3706 * restart the TIME_WAIT timer.3707 */3708 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3709 return(0);
3710 caseTCP_FIN_WAIT1:
3711 /*3712 * This case occurs when a simultaneous close3713 * happens, we must ack the received FIN and3714 * enter the CLOSING state.3715 *3716 * This causes a WRITE timeout, which will either3717 * move on to TIME_WAIT when we timeout, or resend3718 * the FIN properly (maybe we get rid of that annoying3719 * FIN lost hang). The TIME_WRITE code is already correct3720 * for handling this timeout.3721 */3722
3723 if(sk->ip_xmit_timeout != TIME_WRITE)
3724 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3725 tcp_set_state(sk,TCP_CLOSING);
3726 break;
3727 caseTCP_FIN_WAIT2:
3728 /*3729 * received a FIN -- send ACK and enter TIME_WAIT3730 */3731 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3732 sk->shutdown|=SHUTDOWN_MASK;
3733 tcp_set_state(sk,TCP_TIME_WAIT);
3734 break;
3735 caseTCP_CLOSE:
3736 /*3737 * already in CLOSE3738 */3739 break;
3740 default:
3741 tcp_set_state(sk,TCP_LAST_ACK);
3742
3743 /* Start the timers. */3744 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3745 return(0);
3746 }3747
3748 return(0);
3749 }3750
3751
3752
/*
 *	This routine handles the data. If there is room in the buffer,
 *	it will be have already been moved into it. If there is no
 *	room, then we will just have to discard the packet.
 *
 *	Returns 0 in all paths; the skb is either queued on
 *	sk->receive_queue or freed here.
 */

extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk,
	 unsigned long saddr, unsigned short len)
{
	struct sk_buff *skb1, *skb2;
	struct tcphdr *th;
	int dup_dumped=0;
	u32 new_seq, shut_seq;

	th = skb->h.th;
	/* Strip the TCP header so skb->len is the payload length only. */
	skb_pull(skb,th->doff*4);
	skb_trim(skb,len-(th->doff*4));

	/*
	 *	The bytes in the receive read/assembly queue has increased.
	 *	Needed for the low memory discard algorithm.
	 */

	sk->bytes_rcv += skb->len;

	if (skb->len == 0 && !th->fin)
	{
		/*
		 *	Don't want to keep passing ack's back and forth
		 *	(someone sent us a dataless, boring frame).
		 */
		if (!th->ack)
			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
		kfree_skb(skb, FREE_READ);
		return(0);
	}

	/*
	 *	We no longer have anyone receiving data on this connection.
	 */

#ifndef TCP_DONT_RST_SHUTDOWN

	if(sk->shutdown & RCV_SHUTDOWN)
	{
		/*
		 *	FIXME: BSD has some magic to avoid sending resets to
		 *	broken 4.2 BSD keepalives. Much to my surprise a few
		 *	non BSD stacks still have broken keepalives so we want
		 *	to cope with it.
		 */

		if(skb->len)	/* We don't care if it's just an ack or
				   a keepalive/window probe */
		{
			/* Right edge of the _data_ part of the frame. */
			new_seq= th->seq + skb->len + th->syn;

			/*
			 *	Do this the way 4.4BSD treats it. Not what I'd
			 *	regard as the meaning of the spec but it's what
			 *	BSD does and clearly they know everything 8)
			 *
			 *	This is valid because of two things:
			 *	  a) The way tcp_data behaves at the bottom.
			 *	  b) A fin takes effect when read not when
			 *	     received.
			 */

			shut_seq=sk->acked_seq+1;	/* Last byte */

			if(after(new_seq,shut_seq))
			{
				if(sk->debug)
					printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
						sk, new_seq, shut_seq, sk->blog);
				if(sk->dead)
				{
					/* Data after close on a dead socket:
					   reset the connection outright. */
					sk->acked_seq = new_seq + th->fin;
					tcp_reset(sk->saddr, sk->daddr, skb->h.th,
						sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
					tcp_statistics.TcpEstabResets++;
					tcp_set_state(sk,TCP_CLOSE);
					sk->err = EPIPE;
					sk->shutdown = SHUTDOWN_MASK;
					kfree_skb(skb, FREE_READ);
					return 0;
				}
			}
		}
	}

#endif

	/*
	 *	Now we have to walk the chain, and figure out where this one
	 *	goes into it.  This is set up so that the last packet we
	 *	received will be the first one we look at, that way if
	 *	everything comes in order, there will be no performance loss,
	 *	and if they come out of order we will be able to fit things
	 *	in nicely.
	 *
	 *	[AC: This is wrong. We should assume in order first and then
	 *	walk forwards from the first hole based upon real traffic
	 *	patterns.]
	 */

	if (skb_peek(&sk->receive_queue) == NULL)	/* Empty queue is easy case */
	{
		skb_queue_head(&sk->receive_queue,skb);
		skb1= NULL;
	}
	else
	{
		/* Walk backwards from the newest frame looking for the
		   insertion point. */
		for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
		{
			if(sk->debug)
			{
				printk("skb1=%p :", skb1);
				printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
				printk("skb->h.th->seq = %d\n",skb->h.th->seq);
				printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
						sk->acked_seq);
			}

			/*
			 *	Optimisation: Duplicate frame or extension of
			 *	previous frame from the same sequence point
			 *	(lost ack case).  The frame contains duplicate
			 *	data or replaces a previous frame: discard the
			 *	previous frame (safe as sk->inuse is set) and
			 *	put the new one in its place.
			 */

			if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
			{
				skb_append(skb1,skb);
				skb_unlink(skb1);
				kfree_skb(skb1,FREE_READ);
				dup_dumped=1;
				skb1=NULL;
				break;
			}

			/*
			 *	Found where it fits.
			 */

			if (after(th->seq+1, skb1->h.th->seq))
			{
				skb_append(skb1,skb);
				break;
			}

			/*
			 *	See if we've hit the start. If so insert.
			 */
			if (skb1 == skb_peek(&sk->receive_queue))
			{
				skb_queue_head(&sk->receive_queue, skb);
				break;
			}
		}
	}

	/*
	 *	Figure out what the ack value for this frame is
	 *	(SYN and FIN each take one unit of sequence space).
	 */

	th->ack_seq = th->seq + skb->len;
	if (th->syn)
		th->ack_seq++;
	if (th->fin)
		th->ack_seq++;

	if (before(sk->acked_seq, sk->copied_seq))
	{
		printk("*** tcp.c:tcp_data bug acked < copied\n");
		sk->acked_seq = sk->copied_seq;
	}

	/*
	 *	Now figure out if we can ack anything. This is very messy
	 *	because we really want two receive queues, a completed and
	 *	an assembly queue. We also want only one transmit queue.
	 */

	if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
	{
		if (before(th->seq, sk->acked_seq+1))
		{
			int newwindow;

			/* Advance acked_seq and shrink the offered window by
			   the amount of new data consumed. */
			if (after(th->ack_seq, sk->acked_seq))
			{
				newwindow = sk->window-(th->ack_seq - sk->acked_seq);
				if (newwindow < 0)
					newwindow = 0;
				sk->window = newwindow;
				sk->acked_seq = th->ack_seq;
			}
			skb->acked = 1;

			/*
			 *	When we ack the fin, we do the FIN
			 *	processing.
			 */

			if (skb->h.th->fin)
			{
				tcp_fin(skb,sk,skb->h.th);
			}

			/* This frame may have filled a hole: sweep forward
			   acking any now-contiguous queued frames. */
			for(skb2 = skb->next;
			    skb2 != (struct sk_buff *)&sk->receive_queue;
			    skb2 = skb2->next)
			{
				if (before(skb2->h.th->seq, sk->acked_seq+1))
				{
					if (after(skb2->h.th->ack_seq, sk->acked_seq))
					{
						newwindow = sk->window -
						 (skb2->h.th->ack_seq - sk->acked_seq);
						if (newwindow < 0)
							newwindow = 0;
						sk->window = newwindow;
						sk->acked_seq = skb2->h.th->ack_seq;
					}
					skb2->acked = 1;
					/*
					 *	When we ack the fin, we do
					 *	the fin handling.
					 */
					if (skb2->h.th->fin)
					{
						tcp_fin(skb,sk,skb->h.th);
					}

					/*
					 *	Force an immediate ack.
					 */

					sk->ack_backlog = sk->max_ack_backlog;
				}
				else
				{
					break;
				}
			}

			/*
			 *	This also takes care of updating the window.
			 *	This if statement needs to be simplified.
			 *	(The ack in the first branch is sent by the
			 *	common code at the bottom of this function.)
			 */
			if (!sk->delay_acks ||
			    sk->ack_backlog >= sk->max_ack_backlog ||
			    sk->bytes_rcv > sk->max_unacked || th->fin) {
	/*			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
			}
			else
			{
				/* Delay the ack: bump the backlog and arm a
				   timer so it goes out eventually. */
				sk->ack_backlog++;
				if(sk->debug)
					printk("Ack queued.\n");
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
		}
	}

	/*
	 *	If we've missed a packet, send an ack.
	 *	Also start a timer to send another.
	 */

	if (!skb->acked)
	{

	/*
	 *	This is important. If we don't have much room left,
	 *	we need to throw out a few packets so we have a good
	 *	window. Note that mtu is used, not mss, because mss is really
	 *	for the send side. He could be sending us stuff as large
	 *	as mtu.
	 */

		while (sk->prot->rspace(sk) < sk->mtu)
		{
			skb1 = skb_peek(&sk->receive_queue);
			if (skb1 == NULL)
			{
				printk("INET: tcp.c:tcp_data memory leak detected.\n");
				break;
			}

			/*
			 *	Don't throw out something that has been acked.
			 */

			if (skb1->acked)
			{
				break;
			}

			skb_unlink(skb1);
			kfree_skb(skb1, FREE_READ);
		}
		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
		sk->ack_backlog++;
		reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
	}
	else
	{
		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
	}

	/*
	 *	Now tell the user we may have some data.
	 */

	if (!sk->dead)
	{
		if(sk->debug)
			printk("Data wakeup.\n");
		sk->data_ready(sk,0);
	}
	return(0);
}
4077
4078 /*4079 * This routine is only called when we have urgent data4080 * signalled. Its the 'slow' part of tcp_urg. It could be4081 * moved inline now as tcp_urg is only called from one4082 * place. We handle URGent data wrong. We have to - as4083 * BSD still doesn't use the correction from RFC961.4084 */4085
4086 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4087 {4088 u32ptr = ntohs(th->urg_ptr);
4089
4090 if (ptr)
4091 ptr--;
4092 ptr += th->seq;
4093
4094 /* ignore urgent data that we've already seen and read */4095 if (after(sk->copied_seq, ptr))
4096 return;
4097
4098 /* do we already have a newer (or duplicate) urgent pointer? */4099 if (sk->urg_data && !after(ptr, sk->urg_seq))
4100 return;
4101
4102 /* tell the world about our new urgent pointer */4103 if (sk->proc != 0) {4104 if (sk->proc > 0) {4105 kill_proc(sk->proc, SIGURG, 1);
4106 }else{4107 kill_pg(-sk->proc, SIGURG, 1);
4108 }4109 }4110 sk->urg_data = URG_NOTYET;
4111 sk->urg_seq = ptr;
4112 }4113
4114 /*4115 * This is the 'fast' part of urgent handling.4116 */4117
4118 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4119 unsignedlongsaddr, unsignedlonglen)
4120 {4121 u32ptr;
4122
4123 /*4124 * Check if we get a new urgent pointer - normally not 4125 */4126
4127 if (th->urg)
4128 tcp_check_urg(sk,th);
4129
4130 /*4131 * Do we wait for any urgent data? - normally not4132 */4133
4134 if (sk->urg_data != URG_NOTYET)
4135 return 0;
4136
4137 /*4138 * Is the urgent pointer pointing into this packet? 4139 */4140
4141 ptr = sk->urg_seq - th->seq + th->doff*4;
4142 if (ptr >= len)
4143 return 0;
4144
4145 /*4146 * Ok, got the correct packet, update info 4147 */4148
4149 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4150 if (!sk->dead)
4151 sk->data_ready(sk,0);
4152 return 0;
4153 }4154
/*
 *	This will accept the next outstanding connection.
 *
 *	Returns the newly established socket, or NULL with sk->err set
 *	(EINVAL if not listening, EAGAIN if non-blocking and nothing is
 *	pending, ERESTARTSYS if interrupted by a signal).
 */

static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 *	We need to make sure that this socket is listening,
	 *	and that it has something pending.
	 */

	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race: disable interrupts before taking the socket. */
	cli();
	sk->inuse = 1;

	/* Wait until an established connection is queued on the
	   listening socket. */
	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Drop the lock before sleeping so the softirq side can
		   queue new connections and wake us. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked)
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk (the new socket was
	 *	attached to the queued skb by the connection setup code).
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4213
/*
 *	This will initiate an outgoing connection: validate the address,
 *	pick the initial sequence number and MSS, build and transmit the
 *	SYN, and move the socket to SYN_SENT.
 *
 *	Returns 0 on success or a negative errno (-EISCONN, -EINVAL,
 *	-EAFNOSUPPORT, -ENETUNREACH, -ENOMEM).
 */

static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
	{
		return(-EISCONN);
	}

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */

	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 *	Don't want a TCP connection going to a broadcast address.
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	/* Release before the blocking allocation below. */
	release_sock(sk);

	buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;


	/*
	 *	Put in the IP header and routing stuff.
	 */

	rt=ip_rt_route(sk->daddr, NULL, NULL);


	/*
	 *	We need to build the routing stuff from the things saved
	 *	in skb.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
					IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	/* Build the SYN segment itself. */
	t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	t1->seq = ntohl(sk->write_seq++);
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;	/* 20 byte header + 4 bytes of MSS option */

	/* Window clamp from the route, if any. */
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/*
	 *	MSS selection: use 512 or whatever the user asked for,
	 *	else the route's MSS, else guess from whether the peer
	 *	is on our local network.
	 */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
		sk->mtu = rt->rt_mss;
	else
	{
#ifdef CONFIG_INET_SNARL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - HEADER_SIZE;
		else
			sk->mtu = MAX_WINDOW;
	}

	/*
	 *	But not bigger than device MTU.
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	Put in the TCP options to say MTU (kind 2, length 4, MSS).
	 */

	ptr = skb_put(buff,4);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 *	This must go first otherwise a really quick response will
	 *	get reset.
	 */

	tcp_cache_zap();
	tcp_set_state(sk,TCP_SYN_SENT);
	/* Initial RTO from the route if it has one, else the default. */
	if(rt&&rt->rt_flags&RTF_IRTT)
		sk->rto = rt->rt_irtt;
	else
		sk->rto = TCP_TIMEOUT_INIT;
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = 0;	/* Now works the right way instead of a hacked
					initial setting */

	sk->prot->queue_xmit(sk, dev, buff, 0);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4376
4377 /* This functions checks to see if the tcp header is actually acceptable. */4378 extern__inline__inttcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4379 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4380 {4381 u32next_seq;
4382
4383 next_seq = len - 4*th->doff;
4384 if (th->fin)
4385 next_seq++;
4386 /* if we have a zero window, we can't have any data in the packet.. */4387 if (next_seq && !sk->window)
4388 gotoignore_it;
4389 next_seq += th->seq;
4390
4391 /*4392 * This isn't quite right. sk->acked_seq could be more recent4393 * than sk->window. This is however close enough. We will accept4394 * slightly more packets than we should, but it should not cause4395 * problems unless someone is trying to forge packets.4396 */4397
4398 /* have we already seen all of this packet? */4399 if (!after(next_seq+1, sk->acked_seq))
4400 gotoignore_it;
4401 /* or does it start beyond the window? */4402 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4403 gotoignore_it;
4404
4405 /* ok, at least part of this packet would seem interesting.. */4406 return 1;
4407
4408 ignore_it:
4409 if (th->rst)
4410 return 0;
4411
4412 /*4413 * Send a reset if we get something not ours and we are4414 * unsynchronized. Note: We don't do anything to our end. We4415 * are just killing the bogus remote connection then we will4416 * connect again and it will work (with luck).4417 */4418
4419 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4420 {4421 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4422 return 1;
4423 }4424
4425 /* Try to resync things. */4426 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4427 return 0;
4428 }4429
/*
 *	When we get a reset we do this: flag the socket as zapped, record
 *	the error the user will see, move to CLOSE (subject to RFC1337
 *	TIME_WAIT protection if configured), wake any sleeper and drop
 *	the frame.  Always returns 0.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Pick the errno the state implies for a peer reset. */
	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk, TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk, TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 *
 *	This is the main input dispatcher: checksum the segment, find
 *	the socket (via the one-entry header cache), queue on the
 *	backlog if the socket is busy, then run the RFC793 step list
 *	(with RFC1122 corrections) against the socket state.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	unsigned long daddr, unsigned short len,
	unsigned long saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	tcp_statistics.TcpInSegs++;
	/* Only frames addressed to this host are processed. */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket, using the last hit cache if applicable.
	 */

	if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
		sk=(struct sock *)th_cache_sk;
	else
	{
		sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
		th_cache_saddr=saddr;
		th_cache_daddr=daddr;
		th_cache_dport=th->dest;
		th_cache_sport=th->source;
		th_cache_sk=sk;
	}

	/*
	 *	If this socket has got a reset it's to all intents and
	 *	purposes really dead.  Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here.  A 'closed' TCP in BSD
	 *	simply drops data.  This seems incorrect as a 'closed' TCP
	 *	doesn't exist so should cause resets as if the port was
	 *	unreachable.
	 */

	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo)	/* First pass (not replayed from the backlog). */
	{
		/*
		 *	Pull up the IP header.
		 */
		skb_pull(skb, skb->h.raw-skb->data);
		/*
		 *	Try to use the device checksum if provided.
		 */
		if (
			(skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
		    	(!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
		    )
		{
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}
		/* Host byte order from here on. */
		th->seq = ntohl(th->seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset
			 *	(checked in tcp_reset).
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			/*
			 *	Discard frame.
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here. */
		cli();
		if (sk->inuse)
		{
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Backlog replay: the frame was already validated. */
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket; drop the frame if the
	 *	receive buffer is full.
	 */

	if (sk->rmem_alloc + skb->truesize >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

	skb->sk=sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	This basically follows the flow suggested by RFC793, with
	 *	the corrections in RFC1122.  We don't implement precedence
	 *	and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility.  We also set up variables more thoroughly
	 *	[Karn notes in the KA9Q code the RFC793 incoming segment
	 *	rules don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)	/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed
			 *	(old segments).  Broadcast/multicast SYN isn't
			 *	allowed.  Note - bug if you change the netmask
			 *	on a running connection it can go broadcast.
			 *	Even Sun's have this problem so I'm ignoring it.
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up.
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is
			 *	nothing else in the frame.  KA9Q has an option
			 *	to send data with the syn, BSD accepts data
			 *	with the syn up to the [to be] advertised
			 *	window and Solaris 2.1 gives you a protocol
			 *	error.  For now we just ignore it, that fits
			 *	the spec precisely and avoids
			 *	incompatibilities.  It would be nice in future
			 *	to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* Retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and
		 *	either reset for bad matches or go to connected.
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/*
					 *	Reset the ack - it's an ack
					 *	from a different connection
					 *	[th->rst is checked in
					 *	tcp_reset()]
					 */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/*
					 *	A valid ack from a different
					 *	connection start.  Shouldn't
					 *	happen but cover it.
					 */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}
				/*
				 *	Ok.. it's good.  Set up sequence
				 *	numbers and move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq=th->seq+1;
				sk->fin_seq=th->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if(sk->max_window==0)
				{
					sk->max_window = 32;	/* Sanity check */
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/*
					 *	Crossed SYN's are fine - but
					 *	talking to yourself is right
					 *	out...
					 */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port.
	 *	There is a more complex suggestion for fixing these reuse
	 *	issues in RFC1644 but not yet ready for general use.  Also
	 *	see RFC1379.
	 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(th->seq, sk->acked_seq) && !th->rst)
		{
			/* Kill the old TIME_WAIT socket and hand the SYN to
			   any listener on the same port. */
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC).
	 *	Note most of these are inline now.  I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs.
	 */

	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}

	/*
	 *	Process the ACK
	 */


	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	Process urgent data
	 */

	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}


	/*
	 *	Process the encapsulated data
	 */

	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */

	release_sock(sk);
	return 0;
}
4865 /*4866 * This routine sends a packet with an out of date sequence4867 * number. It assumes the other end will try to ack it.4868 */4869
4870 staticvoidtcp_write_wakeup(structsock *sk)
/* */4871 {4872 structsk_buff *buff,*skb;
4873 structtcphdr *t1;
4874 structdevice *dev=NULL;
4875 inttmp;
4876
4877 if (sk->zapped)
4878 return; /* After a valid reset we can send no more */4879
4880 /*4881 * Write data can still be transmitted/retransmitted in the4882 * following states. If any other state is encountered, return.4883 * [listen/close will never occur here anyway]4884 */4885
4886 if (sk->state != TCP_ESTABLISHED &&
4887 sk->state != TCP_CLOSE_WAIT &&
4888 sk->state != TCP_FIN_WAIT1 &&
4889 sk->state != TCP_LAST_ACK &&
4890 sk->state != TCP_CLOSING4891 )
4892 {4893 return;
4894 }4895 if ( before(sk->sent_seq, sk->window_seq) &&
4896 (skb=skb_peek(&sk->write_queue)))
4897 {4898 /*4899 * We are probing the opening of a window4900 * but the window size is != 04901 * must have been a result SWS advoidance ( sender )4902 */4903
4904 structiphdr *iph;
4905 structtcphdr *th;
4906 structtcphdr *nth;
4907 unsignedlongwin_size, ow_size;
4908 void * tcp_data_start;
4909
4910 /*4911 * How many bytes can we send ?4912 */4913
4914 win_size = sk->window_seq - sk->sent_seq;
4915
4916 /*4917 * Recover the buffer pointers4918 */4919
4920 iph = (structiphdr *)(skb->data + skb->dev->hard_header_len);
4921 th = (structtcphdr *)(((char *)iph) +(iph->ihl << 2));
4922
4923 /*4924 * Grab the data for a temporary frame4925 */4926
4927 buff = sk->prot->wmalloc(sk, win_size + th->doff * 4 +
4928 (iph->ihl << 2) +
4929 skb->dev->hard_header_len + 15,
4930 1, GFP_ATOMIC);
4931 if ( buff == NULL )
4932 return;
4933
4934 /* 4935 * If we strip the packet on the write queue we must4936 * be ready to retransmit this one 4937 */4938
4939 buff->free = /*0*/1;
4940
4941 buff->sk = sk;
4942 buff->localroute = sk->localroute;
4943
4944 /*4945 * Put headers on the new packet4946 */4947
4948 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4949 IPPROTO_TCP, sk->opt, buff->truesize,
4950 sk->ip_tos,sk->ip_ttl);
4951 if (tmp < 0)
4952 {4953 sk->prot->wfree(sk, buff);
4954 return;
4955 }4956
4957 /*4958 * Move the TCP header over4959 */4960
4961 buff->dev = dev;
4962
4963 nth = (structtcphdr *) skb_put(buff,th->doff*4);
4964
4965 memcpy(nth, th, th->doff * 4);
4966
4967 /*4968 * Correct the new header4969 */4970
4971 nth->ack = 1;
4972 nth->ack_seq = ntohl(sk->acked_seq);
4973 nth->window = ntohs(tcp_select_window(sk));
4974 nth->check = 0;
4975
4976 /*4977 * Find the first data byte.4978 */4979
4980 tcp_data_start = skb->data + skb->dev->hard_header_len +
4981 (iph->ihl << 2) + th->doff * 4;
4982
4983 /*4984 * Add it to our new buffer4985 */4986 memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
4987
4988 /*4989 * Remember our right edge sequence number.4990 */4991
4992 buff->h.seq = sk->sent_seq + win_size;
4993 sk->sent_seq = buff->h.seq; /* Hack */4994 #if 0
4995
4996 /*4997 * now: shrink the queue head segment 4998 */4999
5000 th->check = 0;
5001 ow_size = skb->len - win_size -
5002 ((unsignedlong) (tcp_data_start - (void *) skb->data));
5003
5004 memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
5005 skb_trim(skb,skb->len-win_size);
5006 sk->sent_seq += win_size;
5007 th->seq = htonl(sk->sent_seq);
5008 if (th->urg)
5009 {5010 unsignedshorturg_ptr;
5011
5012 urg_ptr = ntohs(th->urg_ptr);
5013 if (urg_ptr <= win_size)
5014 th->urg = 0;
5015 else5016 {5017 urg_ptr -= win_size;
5018 th->urg_ptr = htons(urg_ptr);
5019 nth->urg_ptr = htons(win_size);
5020 }5021 }5022 #else5023 if(th->urg && ntohs(th->urg_ptr) < win_size)
5024 nth->urg = 0;
5025 #endif5026
5027 /*5028 * Checksum the split buffer5029 */5030
5031 tcp_send_check(nth, sk->saddr, sk->daddr,
5032 nth->doff * 4 + win_size , sk);
5033 }5034 else5035 {5036 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
5037 if (buff == NULL)
5038 return;
5039
5040 buff->free = 1;
5041 buff->sk = sk;
5042 buff->localroute = sk->localroute;
5043
5044 /*5045 * Put in the IP header and routing stuff. 5046 */5047
5048 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5049 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
5050 if (tmp < 0)
5051 {5052 sk->prot->wfree(sk, buff);
5053 return;
5054 }5055
5056 t1 = (structtcphdr *)skb_put(buff,sizeof(structtcphdr));
5057 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
5058
5059 /*5060 * Use a previous sequence.5061 * This should cause the other end to send an ack.5062 */5063
5064 t1->seq = htonl(sk->sent_seq-1);
5065 t1->ack = 1;
5066 t1->res1= 0;
5067 t1->res2= 0;
5068 t1->rst = 0;
5069 t1->urg = 0;
5070 t1->psh = 0;
5071 t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */5072 t1->syn = 0;
5073 t1->ack_seq = ntohl(sk->acked_seq);
5074 t1->window = ntohs(tcp_select_window(sk));
5075 t1->doff = sizeof(*t1)/4;
5076 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
5077
5078 }5079
5080 /*5081 * Send it.5082 */5083
5084 sk->prot->queue_xmit(sk, dev, buff, 1);
5085 tcp_statistics.TcpOutSegs++;
5086 }5087
5088 /*5089 * A window probe timeout has occurred.5090 */5091
5092 voidtcp_send_probe0(structsock *sk)
/* */5093 {5094 if (sk->zapped)
5095 return; /* After a valid reset we can send no more */5096
5097 tcp_write_wakeup(sk);
5098
5099 sk->backoff++;
5100 sk->rto = min(sk->rto << 1, 120*HZ);
5101 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5102 sk->retransmits++;
5103 sk->prot->retransmits ++;
5104 }5105
5106 /*5107 * Socket option code for TCP. 5108 */5109
5110 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5111 {5112 intval,err;
5113
5114 if(level!=SOL_TCP)
5115 returnip_setsockopt(sk,level,optname,optval,optlen);
5116
5117 if (optval == NULL)
5118 return(-EINVAL);
5119
5120 err=verify_area(VERIFY_READ, optval, sizeof(int));
5121 if(err)
5122 returnerr;
5123
5124 val = get_user((int *)optval);
5125
5126 switch(optname)
5127 {5128 caseTCP_MAXSEG:
5129 /*5130 * values greater than interface MTU won't take effect. however at5131 * the point when this call is done we typically don't yet know5132 * which interface is going to be used5133 */5134 if(val<1||val>MAX_WINDOW)
5135 return -EINVAL;
5136 sk->user_mss=val;
5137 return 0;
5138 caseTCP_NODELAY:
5139 sk->nonagle=(val==0)?0:1;
5140 return 0;
5141 default:
5142 return(-ENOPROTOOPT);
5143 }5144 }5145
5146 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5147 {5148 intval,err;
5149
5150 if(level!=SOL_TCP)
5151 returnip_getsockopt(sk,level,optname,optval,optlen);
5152
5153 switch(optname)
5154 {5155 caseTCP_MAXSEG:
5156 val=sk->user_mss;
5157 break;
5158 caseTCP_NODELAY:
5159 val=sk->nonagle;
5160 break;
5161 default:
5162 return(-ENOPROTOOPT);
5163 }5164 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5165 if(err)
5166 returnerr;
5167 put_user(sizeof(int),(int *) optlen);
5168
5169 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5170 if(err)
5171 returnerr;
5172 put_user(val,(int *)optval);
5173
5174 return(0);
5175 }5176
5177
5178 structprototcp_prot = {5179 sock_wmalloc,
5180 sock_rmalloc,
5181 sock_wfree,
5182 sock_rfree,
5183 sock_rspace,
5184 sock_wspace,
5185 tcp_close,
5186 tcp_read,
5187 tcp_write,
5188 tcp_sendto,
5189 tcp_recvfrom,
5190 ip_build_header,
5191 tcp_connect,
5192 tcp_accept,
5193 ip_queue_xmit,
5194 tcp_retransmit,
5195 tcp_write_wakeup,
5196 tcp_read_wakeup,
5197 tcp_rcv,
5198 tcp_select,
5199 tcp_ioctl,
5200 NULL,
5201 tcp_shutdown,
5202 tcp_setsockopt,
5203 tcp_getsockopt,
5204 128,
5205 0,
5206 "TCP",
5207 0, 0,
5208 {NULL,}5209 };