1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while 26 * sk->inuse=1 and was trying to connect 27 * (tcp_err()). 28 * Alan Cox : All icmp error handling was broken 29 * pointers passed where wrong and the 30 * socket was looked up backwards. Nobody 31 * tested any icmp error code obviously. 32 * Alan Cox : tcp_err() now handled properly. It 33 * wakes people on errors. select 34 * behaves and the icmp error race 35 * has gone by moving it into sock.c 36 * Alan Cox : tcp_reset() fixed to work for 37 * everything not just packets for 38 * unknown sockets. 39 * Alan Cox : tcp option processing. 40 * Alan Cox : Reset tweaked (still not 100%) [Had 41 * syn rule wrong] 42 * Herp Rosmanith : More reset fixes 43 * Alan Cox : No longer acks invalid rst frames. 44 * Acking any kind of RST is right out. 45 * Alan Cox : Sets an ignore me flag on an rst 46 * receive otherwise odd bits of prattle 47 * escape still 48 * Alan Cox : Fixed another acking RST frame bug. 49 * Should stop LAN workplace lockups. 
50 * Alan Cox : Some tidyups using the new skb list 51 * facilities 52 * Alan Cox : sk->keepopen now seems to work 53 * Alan Cox : Pulls options out correctly on accepts 54 * Alan Cox : Fixed assorted sk->rqueue->next errors 55 * Alan Cox : PSH doesn't end a TCP read. Switched a 56 * bit to skb ops. 57 * Alan Cox : Tidied tcp_data to avoid a potential 58 * nasty. 59 * Alan Cox : Added some better commenting, as the 60 * tcp is hard to follow 61 * Alan Cox : Removed incorrect check for 20 * psh 62 * Michael O'Reilly : ack < copied bug fix. 63 * Johannes Stille : Misc tcp fixes (not all in yet). 64 * Alan Cox : FIN with no memory -> CRASH 65 * Alan Cox : Added socket option proto entries. 66 * Also added awareness of them to accept. 67 * Alan Cox : Added TCP options (SOL_TCP) 68 * Alan Cox : Switched wakeup calls to callbacks, 69 * so the kernel can layer network 70 * sockets. 71 * Alan Cox : Use ip_tos/ip_ttl settings. 72 * Alan Cox : Handle FIN (more) properly (we hope). 73 * Alan Cox : RST frames sent on unsynchronised 74 * state ack error. 75 * Alan Cox : Put in missing check for SYN bit. 76 * Alan Cox : Added tcp_select_window() aka NET2E 77 * window non shrink trick. 78 * Alan Cox : Added a couple of small NET2E timer 79 * fixes 80 * Charles Hedrick : TCP fixes 81 * Toomas Tamm : TCP window fixes 82 * Alan Cox : Small URG fix to rlogin ^C ack fight 83 * Charles Hedrick : Rewrote most of it to actually work 84 * Linus : Rewrote tcp_read() and URG handling 85 * completely 86 * Gerhard Koerting: Fixed some missing timer handling 87 * Matthew Dillon : Reworked TCP machine states as per RFC 88 * Gerhard Koerting: PC/TCP workarounds 89 * Adam Caldwell : Assorted timer/timing errors 90 * Matthew Dillon : Fixed another RST bug 91 * Alan Cox : Move to kernel side addressing changes. 92 * Alan Cox : Beginning work on TCP fastpathing 93 * (not yet usable) 94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
95 * Alan Cox : TCP fast path debugging 96 * Alan Cox : Window clamping 97 * Michael Riepe : Bug in tcp_check() 98 * Matt Dillon : More TCP improvements and RST bug fixes 99 * Matt Dillon : Yet more small nasties remove from the 100 * TCP code (Be very nice to this man if 101 * tcp finally works 100%) 8) 102 * Alan Cox : BSD accept semantics. 103 * Alan Cox : Reset on closedown bug. 104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 105 * Michael Pall : Handle select() after URG properly in 106 * all cases. 107 * Michael Pall : Undo the last fix in tcp_read_urg() 108 * (multi URG PUSH broke rlogin). 109 * Michael Pall : Fix the multi URG PUSH problem in 110 * tcp_readable(), select() after URG 111 * works now. 112 * Michael Pall : recv(...,MSG_OOB) never blocks in the 113 * BSD api. 114 * Alan Cox : Changed the semantics of sk->socket to 115 * fix a race and a signal problem with 116 * accept() and async I/O. 117 * Alan Cox : Relaxed the rules on tcp_sendto(). 118 * Yury Shevchuk : Really fixed accept() blocking problem. 119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 120 * clients/servers which listen in on 121 * fixed ports. 122 * Alan Cox : Cleaned the above up and shrank it to 123 * a sensible code size. 124 * Alan Cox : Self connect lockup fix. 125 * Alan Cox : No connect to multicast. 126 * Ross Biro : Close unaccepted children on master 127 * socket close. 128 * Alan Cox : Reset tracing code. 129 * Alan Cox : Spurious resets on shutdown. 130 * Alan Cox : Giant 15 minute/60 second timer error 131 * Alan Cox : Small whoops in selecting before an 132 * accept. 133 * Alan Cox : Kept the state trace facility since 134 * it's handy for debugging. 135 * Alan Cox : More reset handler fixes. 
136 * Alan Cox : Started rewriting the code based on 137 * the RFC's for other useful protocol 138 * references see: Comer, KA9Q NOS, and 139 * for a reference on the difference 140 * between specifications and how BSD 141 * works see the 4.4lite source. 142 * A.N.Kuznetsov : Don't time wait on completion of tidy 143 * close. 144 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 145 * Linus Torvalds : Fixed BSD port reuse to work first syn 146 * Alan Cox : Reimplemented timers as per the RFC 147 * and using multiple timers for sanity. 148 * Alan Cox : Small bug fixes, and a lot of new 149 * comments. 150 * Alan Cox : Fixed dual reader crash by locking 151 * the buffers (much like datagram.c) 152 * Alan Cox : Fixed stuck sockets in probe. A probe 153 * now gets fed up of retrying without 154 * (even a no space) answer. 155 * Alan Cox : Extracted closing code better 156 * Alan Cox : Fixed the closing state machine to 157 * resemble the RFC. 158 * Alan Cox : More 'per spec' fixes. 159 * Jorge Cwik : Even faster checksumming. 160 * Alan Cox : tcp_data() doesn't ack illegal PSH 161 * only frames. At least one pc tcp stack 162 * generates them. 163 * Alan Cox : Cache last socket. 164 * Alan Cox : Per route irtt. 165 * Matt Day : Select() match BSD precisely on error 166 * Alan Cox : New buffers 167 * Marc Tamsky : Various sk->prot->retransmits and 168 * sk->retransmits misupdating fixed. 169 * Fixed tcp_write_timeout: stuck close, 170 * and TCP syn retries gets used now. 171 * Mark Yarvis : In tcp_read_wakeup(), don't send an 172 * ack if stat is TCP_CLOSED. 173 * Alan Cox : Look up device on a retransmit - routes may 174 * change. Doesn't yet cope with MSS shrink right 175 * but its a start! 176 * Marc Tamsky : Closing in closing fixes. 177 * Mike Shaver : RFC1122 verifications 178 * 179 * 180 * To Fix: 181 * Fast path the code. 
Two things here - fix the window calculation 182 * so it doesn't iterate over the queue, also spot packets with no funny 183 * options arriving in order and process directly. 184 * 185 * Implement RFC 1191 [Path MTU discovery] 186 * Look at the effect of implementing RFC 1337 suggestions and their impact. 187 * Rewrite output state machine to use a single queue and do low window 188 * situations as per the spec (RFC 1122) 189 * Speed up input assembly algorithm. 190 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 191 * could do with it working on IPv4 192 * User settable/learned rtt/max window/mtu 193 * Cope with MTU/device switches when retransmitting in tcp. 194 * Fix the window handling to use PR's new code. 195 * 196 * Change the fundamental structure to a single send queue maintained 197 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 198 * active routes too]). Cut the queue off in tcp_retransmit/ 199 * tcp_transmit. 200 * Change the receive queue to assemble as it goes. This lets us 201 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 202 * tcp_data/tcp_read as well as the window shrink crud. 203 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 204 * tcp_queue_skb seem obvious routines to extract. 205 * 206 * This program is free software; you can redistribute it and/or 207 * modify it under the terms of the GNU General Public License 208 * as published by the Free Software Foundation; either version 209 * 2 of the License, or(at your option) any later version. 210 * 211 * Description of States: 212 * 213 * TCP_SYN_SENT sent a connection request, waiting for ack 214 * 215 * TCP_SYN_RECV received a connection request, sent ack, 216 * waiting for final ack in three-way handshake. 
217 * 218 * TCP_ESTABLISHED connection established 219 * 220 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 221 * transmission of remaining buffered data 222 * 223 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 224 * to shutdown 225 * 226 * TCP_CLOSING both sides have shutdown but we still have 227 * data we have to finish sending 228 * 229 * TCP_TIME_WAIT timeout to catch resent junk before entering 230 * closed, can only be entered from FIN_WAIT2 231 * or CLOSING. Required because the other end 232 * may not have gotten our last ACK causing it 233 * to retransmit the data packet (which we ignore) 234 * 235 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 236 * us to finish writing our data and to shutdown 237 * (we have to close() to move on to LAST_ACK) 238 * 239 * TCP_LAST_ACK out side has shutdown after remote has 240 * shutdown. There may still be data in our 241 * buffer that we have to finish sending 242 * 243 * TCP_CLOSE socket is finished 244 */ 245
246 /* 247 * RFC1122 status: 248 * NOTE: I'm not going to be doing comments in the code for this one except 249 * for violations and the like. tcp.c is just too big... If I say something 250 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out 251 * with Alan. -- MS 950903 252 * 253 * Use of PSH (4.2.2.2) 254 * MAY aggregate data sent without the PSH flag. (does) 255 * MAY queue data recieved without the PSH flag. (does) 256 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't) 257 * MAY implement PSH on send calls. (doesn't, thus:) 258 * MUST NOT buffer data indefinitely (doesn't [1 second]) 259 * MUST set PSH on last segment (does) 260 * MAY pass received PSH to application layer (doesn't) 261 * SHOULD send maximum-sized segment whenever possible. (almost always does) 262 * 263 * Window Size (4.2.2.3, 4.2.2.16) 264 * MUST treat window size as an unsigned number (does) 265 * SHOULD treat window size as a 32-bit number (does not) 266 * MUST NOT shrink window once it is offered (does not normally) 267 * 268 * Urgent Pointer (4.2.2.4) 269 * **MUST point urgent pointer to last byte of urgent data (not right 270 * after). (doesn't, to be like BSD) 271 * MUST inform application layer asynchronously of incoming urgent 272 * data. (does) 273 * MUST provide application with means of determining the amount of 274 * urgent data pending. (does) 275 * **MUST support urgent data sequence of arbitrary length. (doesn't, but 276 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) 277 * [Follows BSD 1 byte of urgent data] 278 * 279 * TCP Options (4.2.2.5) 280 * MUST be able to recieve TCP options in any segment. (does) 281 * MUST ignore unsupported options (does) 282 * 283 * Maximum Segment Size Option (4.2.2.6) 284 * MUST implement both sending and receiving MSS. (does) 285 * SHOULD send an MSS with every SYN where recieve MSS != 536 (MAY send 286 * it always). 
(does, even when MSS == 536, which is legal) 287 * MUST assume MSS == 536 if no MSS received at connection setup (does) 288 * MUST calculate "effective send MSS" correctly: 289 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) 290 * (does - but allows operator override) 291 * 292 * TCP Checksum (4.2.2.7) 293 * MUST generate and check TCP checksum. (does) 294 * 295 * Initial Sequence Number Selection (4.2.2.8) 296 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's 297 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is 298 * necessary for 10Mbps networks - and harder than BSD to spoof!) 299 * 300 * Simultaneous Open Attempts (4.2.2.10) 301 * MUST support simultaneous open attempts (does) 302 * 303 * Recovery from Old Duplicate SYN (4.2.2.11) 304 * MUST keep track of active vs. passive open (does) 305 * 306 * RST segment (4.2.2.12) 307 * SHOULD allow an RST segment to contain data (does, but doesn't do 308 * anything with it, which is standard) 309 * 310 * Closing a Connection (4.2.2.13) 311 * MUST inform application of whether connectin was closed by RST or 312 * normal close. (does) 313 * MAY allow "half-duplex" close (treat connection as closed for the 314 * local app, even before handshake is done). (does) 315 * MUST linger in TIME_WAIT for 2 * MSL (does) 316 * 317 * Retransmission Timeout (4.2.2.15) 318 * MUST implement Jacobson's slow start and congestion avoidance 319 * stuff. (does) 320 * 321 * Probing Zero Windows (4.2.2.17) 322 * MUST support probing of zero windows. (does) 323 * MAY keep offered window closed indefinitely. (does) 324 * MUST allow remote window to stay closed indefinitely. (does) 325 * 326 * Passive Open Calls (4.2.2.18) 327 * MUST NOT let new passive open affect other connections. (doesn't) 328 * MUST support passive opens (LISTENs) concurrently. (does) 329 * 330 * Time to Live (4.2.2.19) 331 * MUST make TCP TTL configurable. 
(does - IP_TTL option) 332 * 333 * Event Processing (4.2.2.20) 334 * SHOULD queue out-of-order segments. (does) 335 * MUST aggregate ACK segments whenever possible. (does but badly) 336 * 337 * Retransmission Timeout Calculation (4.2.3.1) 338 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO 339 * calculation. (does, or at least explains them in the comments 8*b) 340 * SHOULD initialize RTO to 0 and RTT to 3. (does) 341 * 342 * When to Send an ACK Segment (4.2.3.2) 343 * SHOULD implement delayed ACK. (does not) 344 * MUST keep ACK delay < 0.5 sec. (N/A) 345 * 346 * When to Send a Window Update (4.2.3.3) 347 * MUST implement receiver-side SWS. (does) 348 * 349 * When to Send Data (4.2.3.4) 350 * MUST implement sender-side SWS. (does - imperfectly) 351 * SHOULD implement Nagle algorithm. (does) 352 * 353 * TCP Connection Failures (4.2.3.5) 354 * MUST handle excessive retransmissions "properly" (see the RFC). (does) 355 * SHOULD inform application layer of soft errors. (doesn't) 356 * 357 * TCP Keep-Alives (4.2.3.6) 358 * MAY provide keep-alives. (does) 359 * MUST make keep-alives configurable on a per-connection basis. (does) 360 * MUST default to no keep-alives. (does) 361 * **MUST make keep-alive interval configurable. (doesn't) 362 * **MUST make default keep-alive interval > 2 hours. (doesn't) 363 * MUST NOT interpret failure to ACK keep-alive packet as dead 364 * connection. (doesn't) 365 * SHOULD send keep-alive with no data. (does) 366 * 367 * TCP Multihoming (4.2.3.7) 368 * MUST get source address from IP layer before sending first 369 * SYN. (does) 370 * MUST use same local address for all segments of a connection. (does) 371 * 372 * IP Options (4.2.3.8) 373 * (I don't think the IP layer sees the IP options, yet.) 374 * MUST ignore unsupported IP options. (does, I guess 8*b) 375 * MAY support Time Stamp and Record Route. (doesn't) 376 * **MUST allow application to specify a source route. (doesn't?) 
377 * **MUST allow receieved Source Route option to set route for all future 378 * segments on this connection. (doesn't, not that I think it's a 379 * huge problem) 380 * 381 * ICMP messages (4.2.3.9) 382 * MUST act on ICMP errors. (does) 383 * MUST slow transmission upon receipt of a Source Quench. (does) 384 * MUST NOT abort connection upon receipt of soft Destination 385 * Unreachables (0, 1, 5), Time Exceededs and Parameter 386 * Problems. (doesn't) 387 * SHOULD report soft Destination Unreachables etc. to the 388 * application. (doesn't) 389 * SHOULD abort connection upon receipt of hard Destination Unreachable 390 * messages (2, 3, 4). (does) 391 * 392 * Remote Address Validation (4.2.3.10) 393 * MUST reject as an error OPEN for invalid remote IP address. (does) 394 * MUST ignore SYN with invalid source address. (does) 395 * MUST silently discard incoming SYN for broadcast/multicast 396 * address. (does) 397 * 398 * Asynchronous Reports (4.2.4.1) 399 * **MUST provide mechanism for reporting soft errors to application 400 * layer. (doesn't) 401 * 402 * Type of Service (4.2.4.2) 403 * MUST allow application layer to set Type of Service. (does IP_TOS) 404 * 405 * (Whew. -- MS 950903) 406 **/ 407
408 #include <linux/types.h>
409 #include <linux/sched.h>
410 #include <linux/mm.h>
411 #include <linux/time.h>
412 #include <linux/string.h>
413 #include <linux/config.h>
414 #include <linux/socket.h>
415 #include <linux/sockios.h>
416 #include <linux/termios.h>
417 #include <linux/in.h>
418 #include <linux/fcntl.h>
419 #include <linux/inet.h>
420 #include <linux/netdevice.h>
421 #include <net/snmp.h>
422 #include <net/ip.h>
423 #include <net/protocol.h>
424 #include <net/icmp.h>
425 #include <net/tcp.h>
426 #include <net/arp.h>
427 #include <linux/skbuff.h>
428 #include <net/sock.h>
429 #include <net/route.h>
430 #include <linux/errno.h>
431 #include <linux/timer.h>
432 #include <asm/system.h>
433 #include <asm/segment.h>
434 #include <linux/mm.h>
435 #include <net/checksum.h>
436
/*
 *	The MSL timer is the 'normal' timer.
 *	(Fix: the '#define' and the macro name were fused into one token,
 *	which is not valid C preprocessor syntax.)
 */

#define reset_msl_timer(x,y,z)	reset_timer(x,y,z)
442
443 #define SEQ_TICK 3
444 unsignedlongseq_offset;
445 structtcp_mibtcp_statistics;
446
447 /* 448 * Cached last hit socket 449 */ 450
451 volatileunsignedlongth_cache_saddr,th_cache_daddr;
452 volatileunsignedshortth_cache_dport, th_cache_sport;
453 volatilestructsock *th_cache_sk;
454
455 voidtcp_cache_zap(void)
/* */ 456 { 457 unsignedlongflags;
458 save_flags(flags);
459 cli();
460 th_cache_saddr=0;
461 th_cache_daddr=0;
462 th_cache_dport=0;
463 th_cache_sport=0;
464 th_cache_sk=NULL;
465 restore_flags(flags);
466 } 467
/* Forward declaration: tcp_close is defined later in this file but is
 * needed by tcp_close_pending() below. */
static void tcp_close(struct sock *sk, int timeout);

/*
 *	The less said about this the better, but it works and will do for 1.2
 */

static struct wait_queue *master_select_wakeup;
476
/*
 *	Return the smaller of two unsigned quantities.
 */
static __inline__ int min(unsigned int a, unsigned int b)
{
	return (a <= b) ? a : b;
}
#undef STATE_TRACE

#ifdef STATE_TRACE
/* Human-readable names for the TCP states, used only by the state-trace
 * debugging printk in tcp_set_state(). */
static char *statename[] = {
	"Unused", "Established", "Syn Sent", "Syn Recv",
	"Fin Wait 1", "Fin Wait 2", "Time Wait", "Close",
	"Close Wait", "Last ACK", "Listen", "Closing"
};
#endif
494 static__inline__voidtcp_set_state(structsock *sk, intstate)
/* */ 495 { 496 if(sk->state==TCP_ESTABLISHED)
497 tcp_statistics.TcpCurrEstab--;
498 #ifdefSTATE_TRACE 499 if(sk->debug)
500 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
501 #endif 502 /* This is a hack but it doesn't occur often and it's going to 503 be a real to fix nicely */ 504
505 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
506 { 507 wake_up_interruptible(&master_select_wakeup);
508 } 509 sk->state=state;
510 if(state==TCP_ESTABLISHED)
511 tcp_statistics.TcpCurrEstab++;
512 } 513
514 /* 515 * This routine picks a TCP windows for a socket based on 516 * the following constraints 517 * 518 * 1. The window can never be shrunk once it is offered (RFC 793) 519 * 2. We limit memory per socket 520 * 521 * For now we use NET2E3's heuristic of offering half the memory 522 * we have handy. All is not as bad as this seems however because 523 * of two things. Firstly we will bin packets even within the window 524 * in order to get the data we are waiting for into the memory limit. 525 * Secondly we bin common duplicate forms at receive time 526 * Better heuristics welcome 527 */ 528
529 inttcp_select_window(structsock *sk)
/* */ 530 { 531 intnew_window = sk->prot->rspace(sk);
532
533 if(sk->window_clamp)
534 new_window=min(sk->window_clamp,new_window);
535 /* 536 * Two things are going on here. First, we don't ever offer a 537 * window less than min(sk->mss, MAX_WINDOW/2). This is the 538 * receiver side of SWS as specified in RFC1122. 539 * Second, we always give them at least the window they 540 * had before, in order to avoid retracting window. This 541 * is technically allowed, but RFC1122 advises against it and 542 * in practice it causes trouble. 543 * 544 * Fixme: This doesn't correctly handle the case where 545 * new_window > sk->window but not by enough to allow for the 546 * shift in sequence space. 547 */ 548 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
549 return(sk->window);
550 return(new_window);
551 } 552
553 /* 554 * Find someone to 'accept'. Must be called with 555 * sk->inuse=1 or cli() 556 */ 557
558 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 559 { 560 structsk_buff *p=skb_peek(&s->receive_queue);
561 if(p==NULL)
562 returnNULL;
563 do 564 { 565 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
566 returnp;
567 p=p->next;
568 } 569 while(p!=(structsk_buff *)&s->receive_queue);
570 returnNULL;
571 } 572
573 /* 574 * Remove a completed connection and return it. This is used by 575 * tcp_accept() to get connections from the queue. 576 */ 577
578 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 579 { 580 structsk_buff *skb;
581 unsignedlongflags;
582 save_flags(flags);
583 cli();
584 skb=tcp_find_established(s);
585 if(skb!=NULL)
586 skb_unlink(skb); /* Take it off the queue */ 587 restore_flags(flags);
588 returnskb;
589 } 590
591 /* 592 * This routine closes sockets which have been at least partially 593 * opened, but not yet accepted. Currently it is only called by 594 * tcp_close, and timeout mirrors the value there. 595 */ 596
597 staticvoidtcp_close_pending (structsock *sk)
/* */ 598 { 599 structsk_buff *skb;
600
601 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
602 { 603 skb->sk->dead=1;
604 tcp_close(skb->sk, 0);
605 kfree_skb(skb, FREE_READ);
606 } 607 return;
608 } 609
610 /* 611 * Enter the time wait state. 612 */ 613
614 staticvoidtcp_time_wait(structsock *sk)
/* */ 615 { 616 tcp_set_state(sk,TCP_TIME_WAIT);
617 sk->shutdown = SHUTDOWN_MASK;
618 if (!sk->dead)
619 sk->state_change(sk);
620 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
621 } 622
/*
 *	A socket has timed out on its send queue and wants to do a
 *	little retransmitting. Currently this means TCP.
 */

/*
 *	Walk the unacked send queue (sk->send_head, chained via link3) and
 *	put each segment back on the wire.  If 'all' is zero only the head
 *	segment is resent; otherwise we stop after sk->cong_window frames.
 *	Each frame is refreshed with the current ack/window, gets a new IP
 *	id, and has its route looked up again in case it changed.
 */
void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;
	int ct=0;		/* frames sent this call */
	struct rtable *rt;

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;	/* restart the RTT clock for this frame */

		/*
		 *	Discard the surplus MAC header
		 */

		skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);

		/*
		 *	In general it's OK just to use the old packet.  However we
		 *	need to use the current ack and window fields.  Urg and
		 *	urg_ptr could possibly stand to be updated as well, but we
		 *	don't keep the necessary data.  That shouldn't be a problem,
		 *	if the other end is doing the right thing.  Since we're
		 *	changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)skb->data;
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = ntohs(iph->tot_len) - (iph->ihl<<2);	/* TCP header + data length */

		/*
		 *	Note: We ought to check for window limits here but
		 *	currently this is done (less efficiently) elsewhere.
		 */

		iph->id = htons(ip_id_count++);
		ip_send_check(iph);

		/*
		 *	Put a MAC header back on (may cause ARPing)
		 */

		if(skb->localroute)
			rt=ip_rt_local(iph->daddr,NULL,NULL);
		else
			rt=ip_rt_route(iph->daddr,NULL,NULL);

		if(rt==NULL)	/* Deep poo - no route: report ENETUNREACH to the owner */
		{
			if(skb->sk)
			{
				skb->sk->err=ENETUNREACH;
				skb->sk->error_report(skb->sk);
			}
		}
		else
		{
			dev=rt->rt_dev;
			skb->raddr=rt->rt_gateway;
			if(skb->raddr==0)	/* directly connected: send to the final destination */
				skb->raddr=iph->daddr;
			skb->dev=dev;
			skb->arp=1;
			if(dev->hard_header)
			{
				/* hard_header may fail if the MAC address isn't
				 * resolved yet; clearing skb->arp makes the
				 * driver layer ARP before transmit. */
				if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
					skb->arp=0;
			}

			/*
			 *	This is not the right way to handle this.  We have to
			 *	issue an up to date window and ack report with this
			 *	retransmit to keep the odd buggy tcp that relies on
			 *	the fact BSD does this happy.
			 *	We don't however need to recalculate the entire
			 *	checksum, so someone wanting a small problem to play
			 *	with might like to implement RFC1141/RFC1624 and speed
			 *	this up by avoiding a full checksum.
			 */

			/* NOTE(review): ntohl/ntohs here where htonl/htons would be
			 * expected - harmless only because the two are the same
			 * byte-swap on the supported architectures. */
			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			/*
			 *	If the interface is (still) up and running, kick it.
			 */

			if (dev->flags & IFF_UP)
			{
				/*
				 *	If the packet is still being sent by the device/protocol
				 *	below then don't retransmit.  This is both needed, and good -
				 *	especially with connected mode AX.25 where it stops resends
				 *	occurring of an as yet unsent anyway frame!
				 *	We still add up the counts as the round trip time wants
				 *	adjusting.
				 */
				if (sk && !skb_device_locked(skb))
				{
					/* Remove it from any existing driver queue first! */
					skb_unlink(skb);
					/* Now queue it */
					ip_statistics.IpOutRequests++;
					dev_queue_xmit(skb, dev, sk->priority);
				}
			}
		}

		/*
		 *	Count retransmissions
		 */

		ct++;
		sk->prot->retransmits ++;
		tcp_statistics.TcpRetransSegs++;

		/*
		 *	Only one retransmit requested.
		 */

		if (!all)
			break;

		/*
		 *	This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;
	}
}
773 /* 774 * Reset the retransmission timer 775 */ 776
777 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 778 { 779 del_timer(&sk->retransmit_timer);
780 sk->ip_xmit_timeout = why;
781 if((int)when < 0)
782 { 783 when=3;
784 printk("Error: Negative timer in xmit_timer\n");
785 } 786 sk->retransmit_timer.expires=jiffies+when;
787 add_timer(&sk->retransmit_timer);
788 } 789
790 /* 791 * This is the normal code called for timeouts. It does the retransmission 792 * and then does backoff. tcp_do_retransmit is separated out because 793 * tcp_ack needs to send stuff from the retransmit queue without 794 * initiating a backoff. 795 */ 796
797
798 voidtcp_retransmit_time(structsock *sk, intall)
/* */ 799 { 800 tcp_do_retransmit(sk, all);
801
802 /* 803 * Increase the timeout each time we retransmit. Note that 804 * we do not increase the rtt estimate. rto is initialized 805 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 806 * that doubling rto each time is the least we can get away with. 807 * In KA9Q, Karn uses this for the first few times, and then 808 * goes to quadratic. netBSD doubles, but only goes up to *64, 809 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 810 * defined in the protocol as the maximum possible RTT. I guess 811 * we'll have to use something other than TCP to talk to the 812 * University of Mars. 813 * 814 * PAWS allows us longer timeouts and large windows, so once 815 * implemented ftp to mars will work nicely. We will have to fix 816 * the 120 second clamps though! 817 */ 818
819 sk->retransmits++;
820 sk->prot->retransmits++;
821 sk->backoff++;
822 sk->rto = min(sk->rto << 1, 120*HZ);
823 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
824 } 825
826
827 /* 828 * A timer event has trigger a tcp retransmit timeout. The 829 * socket xmit queue is ready and set up to send. Because 830 * the ack receive code keeps the queue straight we do 831 * nothing clever here. 832 */ 833
834 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 835 { 836 if (all)
837 { 838 tcp_retransmit_time(sk, all);
839 return;
840 } 841
842 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 843 /* sk->ssthresh in theory can be zero. I guess that's OK */ 844 sk->cong_count = 0;
845
846 sk->cong_window = 1;
847
848 /* Do the actual retransmit. */ 849 tcp_retransmit_time(sk, all);
850 } 851
/*
 *	A write timeout has occurred.  Process the after effects.
 *
 *	Returns 1 if the caller may continue using the socket, 0 if the
 *	connection has been aborted (in which case the socket has already
 *	been released here).
 */

static int tcp_write_timeout(struct sock *sk)
{
	/*
	 *	Look for a 'soft' timeout: periodically (every 8th retransmit
	 *	while established, or past TCP_RETR1 otherwise) assume the
	 *	path may have changed and invalidate the cached ARP entry.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
	{
		/*
		 *	Attempt to recover if arp has changed (unlikely!) or
		 *	a route has shifted (not supported prior to 1.3).
		 */
		arp_destroy (sk->daddr, 0);
		/*ip_route_check (sk->daddr);*/
	}

	/*
	 *	Have we tried to SYN too many times (repent repent 8))
	 */

	if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
	{
		sk->err=ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		tcp_statistics.TcpAttemptFails++;	/* Is this right ??? - FIXME - */
		tcp_set_state(sk,TCP_CLOSE);
		/* Don't FIN, we got nothing back */
		release_sock(sk);
		return 0;	/* socket is gone; caller must not touch it */
	}
	/*
	 *	Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2)
	{
		sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 *	Time wait the socket
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 *	Clean up time.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			release_sock(sk);
			return 0;	/* socket is gone; caller must not touch it */
		}
	}
	return 1;
}
/*
 *	The TCP retransmit timer.  This lacks a few small details.
 *
 *	1.	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 */

static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;	/* reason the timer was armed */

	/*
	 *	only process if socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;	/* claim the socket before re-enabling interrupts */
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
			{
				struct sk_buff *skb;
				unsigned long flags;

				save_flags(flags);
				cli();
				skb = sk->send_head;
				if (!skb)
				{
					/* Nothing outstanding - the timer was only
					 * there for the delayed ack handled above. */
					restore_flags(flags);
				}
				else
				{
					/*
					 *	Kicked by a delayed ack.  Reset timer
					 *	correctly now
					 */
					if (jiffies < skb->when + sk->rto)
					{
						reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
						restore_flags(flags);
						break;
					}
					restore_flags(flags);
					/*
					 *	Retransmission
					 */
					sk->retransmits++;
					sk->prot->retransmits++;
					sk->prot->retransmit (sk, 0);
					tcp_write_timeout(sk);
				}
				break;
			}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 *	this reset_timer() call is a hack, this is not
			 *	how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
/*
 *	This routine is called by the ICMP module when it gets some
 *	sort of error condition. If err < 0 then the socket should
 *	be closed and the error returned to the user. If err > 0
 *	it's just the icmp type << 8 | icmp code. After adjustment
 *	header points to the first 8 bytes of the tcp header. We need
 *	to find the appropriate port.
 */

void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
	__u32 saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	struct iphdr *iph=(struct iphdr *)header;

	/* Skip the IP header to reach the embedded TCP header. */
	header+=4*iph->ihl;


	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)
		return;

	if (type == ICMP_SOURCE_QUENCH)
	{
		/*
		 * FIXME:
		 * For now we will just trigger a linear backoff.
		 * The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	if (type == ICMP_PARAMETERPROB)
	{
		sk->err=EPROTO;
		sk->error_report(sk);
	}

	/*
	 *	If we've already connected we will keep trying
	 *	until we time out, or the user gives up.
	 */

	if (code < 13 && (icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV))
	{
		sk->err = icmp_err_convert[code].errno;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
		{
			/* Connection attempt killed by an ICMP error. */
			tcp_statistics.TcpAttemptFails++;
			tcp_set_state(sk,TCP_CLOSE);
			sk->error_report(sk);	/* Wake people up to see the error (see connect in sock.c) */
		}
	}
	return;
}

/*
 *	Walk down the receive queue counting readable data until we hit the end or we find a gap
 *	in the received data queue (ie a frame missing that needs sending to us). Not
 *	sorting using two queues as data arrives makes life so much harder.
 *
 *	Returns the number of in-sequence bytes available to read.
 */

static int tcp_readable(struct sock *sk)
{
	unsigned long counted;
	unsigned long amount;
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 *	Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->h.th->seq))	/* Found a hole so stops here */
			break;
		sum = skb->len -(counted - skb->h.th->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;
		if (sum > 0)
		{	/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;	/* SYN occupies a sequence number but carries no data */
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop. This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue. Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read(). And the queue scan in tcp_read()
		 * was correct. Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
1166 /*1167 * LISTEN is a special case for select..1168 */1169 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */1170 {1171 if (sel_type == SEL_IN) {1172 intretval;
1173
1174 sk->inuse = 1;
1175 retval = (tcp_find_established(sk) != NULL);
1176 release_sock(sk);
1177 if (!retval)
1178 select_wait(&master_select_wakeup,wait);
1179 returnretval;
1180 }1181 return 0;
1182 }1183

/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to set "sk->inuse", as the upper select layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 *
 *	Returns 1 when the socket is ready for the requested operation,
 *	0 (after registering on the wait queue) when it is not.
 */

static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type) {
	case SEL_IN:
		if (sk->err)
			return 1;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;

		if (sk->shutdown & RCV_SHUTDOWN)
			return 1;

		if (sk->acked_seq == sk->copied_seq)
			break;	/* nothing new has been acked in */

		/* Readable unless the only byte pending is out-of-band data. */
		if (sk->urg_seq != sk->copied_seq ||
		    sk->acked_seq != sk->copied_seq+1 ||
		    sk->urginline || !sk->urg_data)
			return 1;
		break;

	case SEL_OUT:
		if (sk->err)
			return 1;
		if (sk->shutdown & SEND_SHUTDOWN)
			return 0;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;
		/*
		 *	This is now right thanks to a small fix
		 *	by Matt Dillon.
		 */

		if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
			break;	/* not enough buffer space for a full segment */
		return 1;

	case SEL_EX:
		if (sk->urg_data)
			return 1;
		break;
	}
	select_wait(sk->sleep, wait);
	return 0;
}
/*
 *	TCP socket ioctls:
 *	TIOCINQ   - bytes of in-sequence data available to read
 *	SIOCATMARK - are we at the urgent mark?
 *	TIOCOUTQ  - free space in the send buffer
 */

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	int err;
	switch(cmd)
	{

		case TIOCINQ:
#ifdef FIXME	/* FIXME: */
		case FIONREAD:
#endif
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);

			sk->inuse = 1;
			amount = tcp_readable(sk);
			release_sock(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		case SIOCATMARK:
		{
			int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;

			err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
			if (err)
				return err;
			put_user(answ,(int *) arg);
			return(0);
		}
		case TIOCOUTQ:
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN) return(-EINVAL);
			amount = sk->prot->wspace(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		default:
			return(-EINVAL);
	}
}

/*
 *	This routine computes a TCP checksum.
 *
 *	Modified January 1995 from a go-faster DOS routine by
 *	Jorge Cwik <jorge@laser.satlink.net>
 *
 *	'base' is the partial checksum of the TCP header+data;
 *	the pseudo-header (saddr, daddr, proto, len) is folded in here.
 */

unsigned short tcp_check(struct tcphdr *th, int len,
	unsigned long saddr, unsigned long daddr, unsigned long base)
{
	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
}


/*
 *	Fill in the checksum field of an outgoing TCP header.
 *	The field must be zeroed before summing, since it is itself
 *	covered by the checksum.
 */

void tcp_send_check(struct tcphdr *th, unsigned long saddr,
		unsigned long daddr, int len, struct sock *sk)
{
	th->check = 0;
	th->check = tcp_check(th, len, saddr, daddr,
		csum_partial((char *)th,len,0));
	return;
}
1318 /*1319 * This is the main buffer sending routine. We queue the buffer1320 * having checked it is sane seeming.1321 */1322
1323 staticvoidtcp_send_skb(structsock *sk, structsk_buff *skb)
/* */1324 {1325 intsize;
1326 structtcphdr * th = skb->h.th;
1327
1328 /*1329 * length of packet (not counting length of pre-tcp headers) 1330 */1331
1332 size = skb->len - ((unsignedchar *) th - skb->data);
1333
1334 /*1335 * Sanity check it.. 1336 */1337
1338 if (size < sizeof(structtcphdr) || size > skb->len)
1339 {1340 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1341 skb, skb->data, th, skb->len);
1342 kfree_skb(skb, FREE_WRITE);
1343 return;
1344 }1345
1346 /*1347 * If we have queued a header size packet.. (these crash a few1348 * tcp stacks if ack is not set)1349 */1350
1351 if (size == sizeof(structtcphdr))
1352 {1353 /* If it's got a syn or fin it's notionally included in the size..*/1354 if(!th->syn && !th->fin)
1355 {1356 printk("tcp_send_skb: attempt to queue a bogon.\n");
1357 kfree_skb(skb,FREE_WRITE);
1358 return;
1359 }1360 }1361
1362 /*1363 * Actual processing.1364 */1365
1366 tcp_statistics.TcpOutSegs++;
1367 skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1368
1369 /*1370 * We must queue if1371 *1372 * a) The right edge of this frame exceeds the window1373 * b) We are retransmitting (Nagle's rule)1374 * c) We have too many packets 'in flight'1375 */1376
1377 if (after(skb->h.seq, sk->window_seq) ||
1378 (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1379 sk->packets_out >= sk->cong_window)
1380 {1381 /* checksum will be supplied by tcp_write_xmit. So1382 * we shouldn't need to set it at all. I'm being paranoid */1383 th->check = 0;
1384 if (skb->next != NULL)
1385 {1386 printk("tcp_send_partial: next != NULL\n");
1387 skb_unlink(skb);
1388 }1389 skb_queue_tail(&sk->write_queue, skb);
1390
1391 /*1392 * If we don't fit we have to start the zero window1393 * probes. This is broken - we really need to do a partial1394 * send _first_ (This is what causes the Cisco and PC/TCP1395 * grief).1396 */1397
1398 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1399 sk->send_head == NULL && sk->ack_backlog == 0)
1400 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1401 }1402 else1403 {1404 /*1405 * This is going straight out1406 */1407
1408 th->ack_seq = ntohl(sk->acked_seq);
1409 th->window = ntohs(tcp_select_window(sk));
1410
1411 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1412
1413 sk->sent_seq = sk->write_seq;
1414
1415 /*1416 * This is mad. The tcp retransmit queue is put together1417 * by the ip layer. This causes half the problems with1418 * unroutable FIN's and other things.1419 */1420
1421 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1422
1423 /*1424 * Set for next retransmit based on expected ACK time.1425 * FIXME: We set this every time which means our 1426 * retransmits are really about a window behind.1427 */1428
1429 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1430 }1431 }1432
1433 /*1434 * Locking problems lead us to a messy situation where we can have1435 * multiple partially complete buffers queued up. This is really bad1436 * as we don't want to be sending partial buffers. Fix this with1437 * a semaphore or similar to lock tcp_write per socket.1438 *1439 * These routines are pretty self descriptive.1440 */1441
1442 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1443 {1444 structsk_buff * skb;
1445 unsignedlongflags;
1446
1447 save_flags(flags);
1448 cli();
1449 skb = sk->partial;
1450 if (skb) {1451 sk->partial = NULL;
1452 del_timer(&sk->partial_timer);
1453 }1454 restore_flags(flags);
1455 returnskb;
1456 }1457
1458 /*1459 * Empty the partial queue1460 */1461
1462 staticvoidtcp_send_partial(structsock *sk)
/* */1463 {1464 structsk_buff *skb;
1465
1466 if (sk == NULL)
1467 return;
1468 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1469 tcp_send_skb(sk, skb);
1470 }1471
/*
 *	Queue a partial frame.
 *
 *	Any previously queued partial frame is displaced and sent out
 *	(after interrupts are re-enabled) so at most one partial frame
 *	is ever pending per socket.
 */

void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	tmp = sk->partial;
	if (tmp)
		del_timer(&sk->partial_timer);
	sk->partial = skb;
	init_timer(&sk->partial_timer);
	/*
	 *	Wait up to 1 second for the buffer to fill.
	 */
	sk->partial_timer.expires = jiffies+HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	/* Flush the frame we displaced, outside the cli() region. */
	if (tmp)
		tcp_send_skb(sk, tmp);
}
1500
1501 /*1502 * This routine sends an ack and also updates the window. 1503 */1504
1505 staticvoidtcp_send_ack(u32sequence, u32ack,
/* */1506 structsock *sk,
1507 structtcphdr *th, unsignedlongdaddr)
1508 {1509 structsk_buff *buff;
1510 structtcphdr *t1;
1511 structdevice *dev = NULL;
1512 inttmp;
1513
1514 if(sk->zapped)
1515 return; /* We have been reset, we may not send again */1516
1517 /*1518 * We need to grab some memory, and put together an ack,1519 * and then put it into the queue to be sent.1520 */1521
1522 buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1523 if (buff == NULL)
1524 {1525 /* 1526 * Force it to send an ack. We don't have to do this1527 * (ACK is unreliable) but it's much better use of 1528 * bandwidth on slow links to send a spare ack than1529 * resend packets. 1530 */1531
1532 sk->ack_backlog++;
1533 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
1534 {1535 reset_xmit_timer(sk, TIME_WRITE, HZ);
1536 }1537 return;
1538 }1539
1540 /*1541 * Assemble a suitable TCP frame1542 */1543
1544 buff->sk = sk;
1545 buff->localroute = sk->localroute;
1546
1547 /* 1548 * Put in the IP header and routing stuff. 1549 */1550
1551 tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1552 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1553 if (tmp < 0)
1554 {1555 buff->free = 1;
1556 sk->prot->wfree(sk, buff);
1557 return;
1558 }1559 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
1560
1561 memcpy(t1, th, sizeof(*t1));
1562
1563 /*1564 * Swap the send and the receive. 1565 */1566
1567 t1->dest = th->source;
1568 t1->source = th->dest;
1569 t1->seq = ntohl(sequence);
1570 t1->ack = 1;
1571 sk->window = tcp_select_window(sk);
1572 t1->window = ntohs(sk->window);
1573 t1->res1 = 0;
1574 t1->res2 = 0;
1575 t1->rst = 0;
1576 t1->urg = 0;
1577 t1->syn = 0;
1578 t1->psh = 0;
1579 t1->fin = 0;
1580
1581 /*1582 * If we have nothing queued for transmit and the transmit timer1583 * is on we are just doing an ACK timeout and need to switch1584 * to a keepalive.1585 */1586
1587 if (ack == sk->acked_seq)
1588 {1589 sk->ack_backlog = 0;
1590 sk->bytes_rcv = 0;
1591 sk->ack_timed = 0;
1592 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL1593 && sk->ip_xmit_timeout == TIME_WRITE)
1594 {1595 if(sk->keepopen) {1596 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1597 }else{1598 delete_timer(sk);
1599 }1600 }1601 }1602
1603 /*1604 * Fill in the packet and send it1605 */1606
1607 t1->ack_seq = ntohl(ack);
1608 t1->doff = sizeof(*t1)/4;
1609 tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1610 if (sk->debug)
1611 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
1612 tcp_statistics.TcpOutSegs++;
1613 sk->prot->queue_xmit(sk, dev, buff, 1);
1614 }1615

/*
 *	This routine builds a generic TCP header.
 *
 *	Copies the socket's template header, fills in seq/ack/window
 *	(in network byte order) and clears the pending-ack bookkeeping,
 *	since this segment carries the ack. 'push' == 0 sets PSH.
 *
 *	Returns the header length in bytes.
 */

extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
{

	memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
	th->seq = htonl(sk->write_seq);
	th->psh =(push == 0) ? 1 : 0;
	th->doff = sizeof(*th)/4;
	th->ack = 1;
	th->fin = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->ack_timed = 0;
	th->ack_seq = htonl(sk->acked_seq);
	sk->window = tcp_select_window(sk);
	th->window = htons(sk->window);

	return(sizeof(*th));
}
/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	Returns the number of bytes copied, or a negative errno.
 *	Once any data has been copied, errors are deferred and the
 *	partial count is returned instead.
 */

static int tcp_write(struct sock *sk, const unsigned char *from,
	int len, int nonblock, unsigned flags)
{
	int copied = 0;
	int copy;
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1;
	prot = sk->prot;
	while(len > 0)
	{
		if (sk->err)
		{	/* Stop on an error */
			release_sock(sk);
			if (copied)
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 *	First thing we do is make sure that we are established.
		 */

		if (sk->shutdown & SEND_SHUTDOWN)
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied)
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/*
		 *	Wait for a connection to finish.
		 */

		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
		{
			if (sk->err)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			/* Not connecting either: the connection is dead. */
			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
			{
				release_sock(sk);
				if (copied)
					return(copied);

				if (sk->err)
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen)
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			release_sock(sk);
			cli();

			/* Sleep until the connect completes (or errors). */
			if (sk->state != TCP_ESTABLISHED &&
			    sk->state != TCP_CLOSE_WAIT && sk->err == 0)
			{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

		/*
		 *	The following code can result in copy <= if sk->mss is ever
		 *	decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
		 *	sk->mtu is constant once SYN processing is finished. I.e. we
		 *	had better not get here until we've seen his SYN and at least one
		 *	valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
		 *	But ESTABLISHED should guarantee that. sk->max_window is by definition
		 *	non-decreasing. Note that any ioctl to set user_mss must be done
		 *	before the exchange of SYN's. If the initial ack from the other
		 *	end has a window of 0, max_window and thus mss will both be 0.
		 */

		/*
		 *	Now we need to check if we have a half built packet.
		 */

		if ((skb = tcp_dequeue_partial(sk)) != NULL)
		{
			int hdrlen;

			/* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				 + sizeof(struct tcphdr);

			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB))
			{
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0)
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}

				memcpy_fromfs(skb_put(skb,copy), from, copy);
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy;
			}
			/* Full segment, OOB data or nothing in flight: send now. */
			if ((skb->len - hdrlen) >= sk->mss ||
			    (flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);
			continue;
		}

		/*
		 *	We also need to worry about the window.
		 *	If window < 1/2 the maximum window we've seen from this
		 *	host, don't use it. This is sender side
		 *	silly window prevention, as specified in RFC1122.
		 *	(Note that this is different than earlier versions of
		 *	SWS prevention, e.g. RFC813.). What we actually do is
		 *	use the whole MSS. Since the results in the right
		 *	edge of the packet being outside the window, it will
		 *	be queued for later rather than sent.
		 */

		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		if (copy > len)
			copy = len;

		/*
		 *	We should really check the window here also.
		 */

		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB))
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb;	/* remember: candidate for the partial queue */
		}
		else
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep.
		 */

		if (skb == NULL)
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition.
			 */

			tmp = sk->wmem_alloc;
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it.
			 */
			if (tmp <= sk->wmem_alloc &&
			    (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
			    && sk->err == 0)
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

		/*
		 *	FIXME: we need to optimize this.
		 *	Perhaps some hints here would be good.
		 */

		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 )
		{
			prot->wfree(sk, skb);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}
		skb->dev = dev;
		skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
		tmp = tcp_build_header(skb->h.th, sk, len-copy);
		if (tmp < 0)
		{
			prot->wfree(sk, skb);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB)
		{
			skb->h.th->urg = 1;
			skb->h.th->urg_ptr = ntohs(copy);
		}

		memcpy_fromfs(skb_put(skb,copy), from, copy);

		from += copy;
		copied += copy;
		len -= copy;
		skb->free = 0;
		sk->write_seq += copy;

		/* Data in flight and this was a short frame: hold it back. */
		if (send_tmp != NULL && sk->packets_out)
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

	/*
	 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
	 *	interactive fast network servers. It's meant to be on and
	 *	it really improves the throughput though not the echo time
	 *	on my slow slip link - Alan
	 */

	/*
	 *	Avoid possible race on send_tmp - c/o Johannes Stille
	 */

	if(sk->partial && ((!sk->packets_out)
	/* If not nagling we can send on the before case too.. */
	   || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
1962 /*1963 * This is just a wrapper. 1964 */1965
1966 staticinttcp_sendto(structsock *sk, constunsignedchar *from,
/* */1967 intlen, intnonblock, unsignedflags,
1968 structsockaddr_in *addr, intaddr_len)
1969 {1970 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1971 return -EINVAL;
1972 if (sk->state == TCP_CLOSE)
1973 return -ENOTCONN;
1974 if (addr_len < sizeof(*addr))
1975 return -EINVAL;
1976 if (addr->sin_family && addr->sin_family != AF_INET)
1977 return -EINVAL;
1978 if (addr->sin_port != sk->dummy_th.dest)
1979 return -EISCONN;
1980 if (addr->sin_addr.s_addr != sk->daddr)
1981 return -EISCONN;
1982 returntcp_write(sk, from, len, nonblock, flags);
1983 }1984
1985
1986 /*1987 * Send an ack if one is backlogged at this point. Ought to merge1988 * this with tcp_send_ack().1989 */1990
1991 staticvoidtcp_read_wakeup(structsock *sk)
/* */1992 {1993 inttmp;
1994 structdevice *dev = NULL;
1995 structtcphdr *t1;
1996 structsk_buff *buff;
1997
1998 if (!sk->ack_backlog)
1999 return;
2000
2001 /*2002 * If we're closed, don't send an ack, or we'll get a RST2003 * from the closed destination.2004 */2005 if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
2006 return;
2007
2008 /*2009 * FIXME: we need to put code here to prevent this routine from2010 * being called. Being called once in a while is ok, so only check2011 * if this is the second time in a row.2012 */2013
2014 /*2015 * We need to grab some memory, and put together an ack,2016 * and then put it into the queue to be sent.2017 */2018
2019 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
2020 if (buff == NULL)
2021 {2022 /* Try again real soon. */2023 reset_xmit_timer(sk, TIME_WRITE, HZ);
2024 return;
2025 }2026
2027 buff->sk = sk;
2028 buff->localroute = sk->localroute;
2029
2030 /*2031 * Put in the IP header and routing stuff. 2032 */2033
2034 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
2035 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
2036 if (tmp < 0)
2037 {2038 buff->free = 1;
2039 sk->prot->wfree(sk, buff);
2040 return;
2041 }2042
2043 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
2044
2045 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
2046 t1->seq = htonl(sk->sent_seq);
2047 t1->ack = 1;
2048 t1->res1 = 0;
2049 t1->res2 = 0;
2050 t1->rst = 0;
2051 t1->urg = 0;
2052 t1->syn = 0;
2053 t1->psh = 0;
2054 sk->ack_backlog = 0;
2055 sk->bytes_rcv = 0;
2056 sk->window = tcp_select_window(sk);
2057 t1->window = ntohs(sk->window);
2058 t1->ack_seq = ntohl(sk->acked_seq);
2059 t1->doff = sizeof(*t1)/4;
2060 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2061 sk->prot->queue_xmit(sk, dev, buff, 1);
2062 tcp_statistics.TcpOutSegs++;
2063 }2064

/*
 *	FIXME:
 *	This routine frees used buffers.
 *	It should consider sending an ACK to let the
 *	other end know we now have a bigger window.
 */

static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	left = sk->prot->rspace(sk);	/* receive space before freeing */

	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can.
	 */

	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		/* Stop at the first buffer still unread or in use. */
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */

	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
			left);
	if ((rspace=sk->prot->rspace(sk)) != left)
	{
		/*
		 *	This area has caused the most trouble. The current strategy
		 *	is to simply do nothing if the other end has room to send at
		 *	least 3 full packets, because the ack from those will auto-
		 *	matically update the window. If the other end doesn't think
		 *	we have much space left, but we have room for at least 1 more
		 *	complete packet than it thinks we do, we will send an ack
		 *	immediately. Otherwise we will wait up to .5 seconds in case
		 *	the user reads some more.
		 */
		sk->ack_backlog++;

		/*
		 *	It's unclear whether to use sk->mtu or sk->mss here. They differ only
		 *	if the other end is offering a window smaller than the agreed on MSS
		 *	(called sk->mtu here). In theory there's no connection between send
		 *	and receive, and so no reason to think that they're going to send
		 *	small packets. For the moment I'm using the hack of reducing the mss
		 *	only on the send side, so I'm putting mtu here.
		 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);
		}
	}
}

/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 *
 *	Returns 1 and copies the single OOB byte when valid urgent data
 *	is pending, 0 at EOF/shutdown, or a negative errno.
 */

static int tcp_read_urg(struct sock * sk, int nonblock,
	 unsigned char *to, int len, unsigned flags)
{
	/*
	 *	No URG data to read: inline mode, nothing pending, or
	 *	the byte was already consumed.
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
	{
		int tmp = -sk->err;
		sk->err = 0;
		return tmp;
	}

	if (sk->state == TCP_CLOSE || sk->done)
	{
		/* First read after close reports EOF, later ones ENOTCONN. */
		if (!sk->done) {
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID)
	{
		char c = sk->urg_data;	/* low byte holds the OOB data byte */
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;
		put_fs_byte(c, to);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 *	Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
	 *	the available implementations agree in this case:
	 *	this call should never block, independent of the
	 *	blocking state of the socket.
	 *	Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
2213
/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Copies up to 'len' bytes of in-order data from sk's receive queue
 *	into the user buffer 'to'.  Returns the number of bytes copied,
 *	0 at a clean end-of-stream, or a negative errno.  Blocks unless
 *	'nonblock' is set.  With MSG_PEEK the data is left on the queue.
 */
static int tcp_read(struct sock *sk, unsigned char *to,
	int len, int nonblock, unsigned flags)
{
	struct wait_queue wait = {current, NULL};
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */

	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */

	if (flags & MSG_OOB)
		return tcp_read_urg(sk, nonblock, to, len, flags);

	/*
	 *	Copying sequence to update.  This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 *	MSG_PEEK advances only the local copy, so the data is
	 *	still there for the next reader.
	 */

	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		u32 offset;

		/*
		 *	Are we at urgent data?  Stop if we have read anything.
		 */

		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.  Set TASK_INTERRUPTIBLE before
		 *	scanning so a wakeup between the scan and schedule()
		 *	is not lost.
		 */

		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* Gap in the sequence space: nothing readable yet. */
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			/* A SYN occupies one sequence number but no data byte. */
			if (skb->h.th->syn)
				offset--;
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			/* Fully consumed buffer: mark it for cleanup_rbuf(). */
			if (!(flags & MSG_PEEK))
				skb->used = 1;
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* Nothing more in order; return what we have so far. */
		if (copied)
			break;

		if (sk->err)
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Ack consumed data and sleep until more arrives. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer.  We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */

		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */

		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?  If the urgent byte is
		 *	inside this chunk, either skip over it (not urginline)
		 *	or stop the copy just before it.
		 */

		if (sk->urg_data)
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					if (!sk->urginline)
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */

		*seq += used;

		/*
		 *	This memcpy_tofs can sleep.  If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */

		memcpy_tofs(to,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		to += used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb.  Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */

		skb->users --;

		/* Urgent byte has been passed: clear the urgent state. */
		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;
		if (used + offset < skb->len)
			continue;

		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		continue;

	found_fin_ok:
		/* FIN consumes one sequence number. */
		++*seq;
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */

		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2445 /*2446 * State processing on a close. This implements the state shift for2447 * sending our FIN frame. Note that we only send a FIN for some 2448 * states. A shutdown() may have already sent the FIN, or we may be2449 * closed.2450 */2451
2452 staticinttcp_close_state(structsock *sk, intdead)
/* */2453 {2454 intns=TCP_CLOSE;
2455 intsend_fin=0;
2456 switch(sk->state)
2457 {2458 caseTCP_SYN_SENT: /* No SYN back, no FIN needed */2459 break;
2460 caseTCP_SYN_RECV:
2461 caseTCP_ESTABLISHED: /* Closedown begin */2462 ns=TCP_FIN_WAIT1;
2463 send_fin=1;
2464 break;
2465 caseTCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */2466 caseTCP_FIN_WAIT2:
2467 caseTCP_CLOSING:
2468 ns=sk->state;
2469 break;
2470 caseTCP_CLOSE:
2471 caseTCP_LISTEN:
2472 break;
2473 caseTCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and2474 wait only for the ACK */2475 ns=TCP_LAST_ACK;
2476 send_fin=1;
2477 }2478
2479 tcp_set_state(sk,ns);
2480
2481 /*2482 * This is a (useful) BSD violating of the RFC. There is a2483 * problem with TCP as specified in that the other end could2484 * keep a socket open forever with no application left this end.2485 * We use a 3 minute timeout (about the same as BSD) then kill2486 * our end. If they send after that then tough - BUT: long enough2487 * that we won't make the old 4*rto = almost no time - whoops2488 * reset mistake.2489 */2490 if(dead && ns==TCP_FIN_WAIT2)
2491 {2492 inttimer_active=del_timer(&sk->timer);
2493 if(timer_active)
2494 add_timer(&sk->timer);
2495 else2496 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2497 }2498
2499 returnsend_fin;
2500 }2501
/*
 *	Send a fin.
 *
 *	Builds a FIN|ACK segment for sk and either transmits it immediately
 *	or, if the write queue is non-empty, appends it so it goes out after
 *	the pending data.  sk->write_seq is advanced by one for the FIN.
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	/* GFP_KERNEL: may sleep, which is why the socket was released above. */
	buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost.
		 *	(Not good).  Burn the sequence number for the FIN and
		 *	fall back to the MSL timer unless one is already armed.
		 */

		buff->free = 1;
		prot->wfree(sk,buff);
		sk->write_seq++;
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}

	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	t1->seq = ntohl(sk->write_seq);
	sk->write_seq++;		/* the FIN consumes one sequence number */
	buff->h.seq = sk->write_seq;
	t1->ack = 1;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 *	If there is data in the write queue, the fin must be appended to
	 *	the write queue so it does not overtake unsent data.
	 */

	if (skb_peek(&sk->write_queue) != NULL)
	{
		buff->free = 0;
		if (buff->next != NULL)
		{
			/* should not happen; unlink defensively before queueing */
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	}
	else
	{
		/* Queue empty: transmit now and start the retransmit timer. */
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2602 /*2603 * Shutdown the sending side of a connection. Much like close except2604 * that we don't receive shut down or set sk->dead=1.2605 */2606
2607 voidtcp_shutdown(structsock *sk, inthow)
/* */2608 {2609 /*2610 * We need to grab some memory, and put together a FIN,2611 * and then put it into the queue to be sent.2612 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2613 */2614
2615 if (!(how & SEND_SHUTDOWN))
2616 return;
2617
2618 /*2619 * If we've already sent a FIN, or it's a closed state2620 */2621
2622 if (sk->state == TCP_FIN_WAIT1 ||
2623 sk->state == TCP_FIN_WAIT2 ||
2624 sk->state == TCP_CLOSING ||
2625 sk->state == TCP_LAST_ACK ||
2626 sk->state == TCP_TIME_WAIT ||
2627 sk->state == TCP_CLOSE ||
2628 sk->state == TCP_LISTEN2629 )
2630 {2631 return;
2632 }2633 sk->inuse = 1;
2634
2635 /*2636 * flag that the sender has shutdown2637 */2638
2639 sk->shutdown |= SEND_SHUTDOWN;
2640
2641 /*2642 * Clear out any half completed packets. 2643 */2644
2645 if (sk->partial)
2646 tcp_send_partial(sk);
2647
2648 /*2649 * FIN if needed2650 */2651
2652 if(tcp_close_state(sk,0))
2653 tcp_send_fin(sk);
2654
2655 release_sock(sk);
2656 }2657
2658
2659 staticint2660 tcp_recvfrom(structsock *sk, unsignedchar *to,
/* */2661 intto_len, intnonblock, unsignedflags,
2662 structsockaddr_in *addr, int *addr_len)
2663 {2664 intresult;
2665
2666 /* 2667 * Have to check these first unlike the old code. If 2668 * we check them after we lose data on an error2669 * which is wrong 2670 */2671
2672 if(addr_len)
2673 *addr_len = sizeof(*addr);
2674 result=tcp_read(sk, to, to_len, nonblock, flags);
2675
2676 if (result < 0)
2677 return(result);
2678
2679 if(addr)
2680 {2681 addr->sin_family = AF_INET;
2682 addr->sin_port = sk->dummy_th.dest;
2683 addr->sin_addr.s_addr = sk->daddr;
2684 }2685 return(result);
2686 }2687
2688
2689 /*2690 * This routine will send an RST to the other tcp. 2691 */2692
2693 staticvoidtcp_reset(unsignedlongsaddr, unsignedlongdaddr, structtcphdr *th,
/* */2694 structproto *prot, structoptions *opt, structdevice *dev, inttos, intttl)
2695 {2696 structsk_buff *buff;
2697 structtcphdr *t1;
2698 inttmp;
2699 structdevice *ndev=NULL;
2700
2701 /*2702 * Cannot reset a reset (Think about it).2703 */2704
2705 if(th->rst)
2706 return;
2707
2708 /*2709 * We need to grab some memory, and put together an RST,2710 * and then put it into the queue to be sent.2711 */2712
2713 buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2714 if (buff == NULL)
2715 return;
2716
2717 buff->sk = NULL;
2718 buff->dev = dev;
2719 buff->localroute = 0;
2720
2721 /*2722 * Put in the IP header and routing stuff. 2723 */2724
2725 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2726 sizeof(structtcphdr),tos,ttl);
2727 if (tmp < 0)
2728 {2729 buff->free = 1;
2730 prot->wfree(NULL, buff);
2731 return;
2732 }2733
2734 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
2735 memcpy(t1, th, sizeof(*t1));
2736
2737 /*2738 * Swap the send and the receive. 2739 */2740
2741 t1->dest = th->source;
2742 t1->source = th->dest;
2743 t1->rst = 1;
2744 t1->window = 0;
2745
2746 if(th->ack)
2747 {2748 t1->ack = 0;
2749 t1->seq = th->ack_seq;
2750 t1->ack_seq = 0;
2751 }2752 else2753 {2754 t1->ack = 1;
2755 if(!th->syn)
2756 t1->ack_seq=htonl(th->seq);
2757 else2758 t1->ack_seq=htonl(th->seq+1);
2759 t1->seq=0;
2760 }2761
2762 t1->syn = 0;
2763 t1->urg = 0;
2764 t1->fin = 0;
2765 t1->psh = 0;
2766 t1->doff = sizeof(*t1)/4;
2767 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2768 prot->queue_xmit(NULL, ndev, buff, 1);
2769 tcp_statistics.TcpOutSegs++;
2770 }2771
2772
2773 /*2774 * Look for tcp options. Parses everything but only knows about MSS.2775 * This routine is always called with the packet containing the SYN.2776 * However it may also be called with the ack to the SYN. So you2777 * can't assume this is always the SYN. It's always called after2778 * we have set up sk->mtu to our own MTU.2779 *2780 * We need at minimum to add PAWS support here. Possibly large windows2781 * as Linux gets deployed on 100Mb/sec networks.2782 */2783
2784 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2785 {2786 unsignedchar *ptr;
2787 intlength=(th->doff*4)-sizeof(structtcphdr);
2788 intmss_seen = 0;
2789
2790 ptr = (unsignedchar *)(th + 1);
2791
2792 while(length>0)
2793 {2794 intopcode=*ptr++;
2795 intopsize=*ptr++;
2796 switch(opcode)
2797 {2798 caseTCPOPT_EOL:
2799 return;
2800 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */2801 length--;
2802 ptr--; /* the opsize=*ptr++ above was a mistake */2803 continue;
2804
2805 default:
2806 if(opsize<=2) /* Avoid silly options looping forever */2807 return;
2808 switch(opcode)
2809 {2810 caseTCPOPT_MSS:
2811 if(opsize==4 && th->syn)
2812 {2813 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2814 mss_seen = 1;
2815 }2816 break;
2817 /* Add other options here as people feel the urge to implement stuff like large windows */2818 }2819 ptr+=opsize-2;
2820 length-=opsize;
2821 }2822 }2823 if (th->syn)
2824 {2825 if (! mss_seen)
2826 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2827 }2828 #ifdefCONFIG_INET_PCTCP2829 sk->mss = min(sk->max_window >> 1, sk->mtu);
2830 #else2831 sk->mss = min(sk->max_window, sk->mtu);
2832 #endif2833 }2834
/*
 *	Classful netmask for an address given in network byte order.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);
	unsigned long mask;

	if (IN_CLASSA(host))
		mask = IN_CLASSA_NET;
	else if (IN_CLASSB(host))
		mask = IN_CLASSB_NET;
	else
		mask = IN_CLASSC_NET;

	return htonl(mask);
}
2845 /*2846 * Default sequence number picking algorithm.2847 * As close as possible to RFC 793, which2848 * suggests using a 250kHz clock.2849 * Further reading shows this assumes 2MB/s networks.2850 * For 10MB/s ethernet, a 1MHz clock is appropriate.2851 * That's funny, Linux has one built in! Use it!2852 */2853
2854 externinlineu32tcp_init_seq(void)
/* */2855 {2856 structtimevaltv;
2857 do_gettimeofday(&tv);
2858 returntv.tv_usec+tv.tv_sec*1000000;
2859 }2860
2861 /*2862 * This routine handles a connection request.2863 * It should make sure we haven't already responded.2864 * Because of the way BSD works, we have to send a syn/ack now.2865 * This also means it will be harder to close a socket which is2866 * listening.2867 */2868
2869 staticvoidtcp_conn_request(structsock *sk, structsk_buff *skb,
/* */2870 unsignedlongdaddr, unsignedlongsaddr,
2871 structoptions *opt, structdevice *dev, u32seq)
2872 {2873 structsk_buff *buff;
2874 structtcphdr *t1;
2875 unsignedchar *ptr;
2876 structsock *newsk;
2877 structtcphdr *th;
2878 structdevice *ndev=NULL;
2879 inttmp;
2880 structrtable *rt;
2881
2882 th = skb->h.th;
2883
2884 /* If the socket is dead, don't accept the connection. */2885 if (!sk->dead)
2886 {2887 sk->data_ready(sk,0);
2888 }2889 else2890 {2891 if(sk->debug)
2892 printk("Reset on %p: Connect on dead socket.\n",sk);
2893 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2894 tcp_statistics.TcpAttemptFails++;
2895 kfree_skb(skb, FREE_READ);
2896 return;
2897 }2898
2899 /*2900 * Make sure we can accept more. This will prevent a2901 * flurry of syns from eating up all our memory.2902 */2903
2904 if (sk->ack_backlog >= sk->max_ack_backlog)
2905 {2906 tcp_statistics.TcpAttemptFails++;
2907 kfree_skb(skb, FREE_READ);
2908 return;
2909 }2910
2911 /*2912 * We need to build a new sock struct.2913 * It is sort of bad to have a socket without an inode attached2914 * to it, but the wake_up's will just wake up the listening socket,2915 * and if the listening socket is destroyed before this is taken2916 * off of the queue, this will take care of it.2917 */2918
2919 newsk = (structsock *) kmalloc(sizeof(structsock), GFP_ATOMIC);
2920 if (newsk == NULL)
2921 {2922 /* just ignore the syn. It will get retransmitted. */2923 tcp_statistics.TcpAttemptFails++;
2924 kfree_skb(skb, FREE_READ);
2925 return;
2926 }2927
2928 memcpy(newsk, sk, sizeof(*newsk));
2929 newsk->opt = NULL;
2930 if (opt && opt->optlen) {2931 sk->opt = (structoptions*)kmalloc(sizeof(structoptions)+opt->optlen, GFP_ATOMIC);
2932 if (!sk->opt) {2933 kfree_s(newsk, sizeof(structsock));
2934 tcp_statistics.TcpAttemptFails++;
2935 kfree_skb(skb, FREE_READ);
2936 return;
2937 }2938 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {2939 kfree_s(sk->opt, sizeof(structoptions)+opt->optlen);
2940 kfree_s(newsk, sizeof(structsock));
2941 tcp_statistics.TcpAttemptFails++;
2942 kfree_skb(skb, FREE_READ);
2943 return;
2944 }2945 }2946 skb_queue_head_init(&newsk->write_queue);
2947 skb_queue_head_init(&newsk->receive_queue);
2948 newsk->send_head = NULL;
2949 newsk->send_tail = NULL;
2950 skb_queue_head_init(&newsk->back_log);
2951 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/2952 newsk->rto = TCP_TIMEOUT_INIT;
2953 newsk->mdev = 0;
2954 newsk->max_window = 0;
2955 newsk->cong_window = 1;
2956 newsk->cong_count = 0;
2957 newsk->ssthresh = 0;
2958 newsk->backoff = 0;
2959 newsk->blog = 0;
2960 newsk->intr = 0;
2961 newsk->proc = 0;
2962 newsk->done = 0;
2963 newsk->partial = NULL;
2964 newsk->pair = NULL;
2965 newsk->wmem_alloc = 0;
2966 newsk->rmem_alloc = 0;
2967 newsk->localroute = sk->localroute;
2968
2969 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
2970
2971 newsk->err = 0;
2972 newsk->shutdown = 0;
2973 newsk->ack_backlog = 0;
2974 newsk->acked_seq = skb->h.th->seq+1;
2975 newsk->copied_seq = skb->h.th->seq+1;
2976 newsk->fin_seq = skb->h.th->seq;
2977 newsk->state = TCP_SYN_RECV;
2978 newsk->timeout = 0;
2979 newsk->ip_xmit_timeout = 0;
2980 newsk->write_seq = seq;
2981 newsk->window_seq = newsk->write_seq;
2982 newsk->rcv_ack_seq = newsk->write_seq;
2983 newsk->urg_data = 0;
2984 newsk->retransmits = 0;
2985 newsk->linger=0;
2986 newsk->destroy = 0;
2987 init_timer(&newsk->timer);
2988 newsk->timer.data = (unsignedlong)newsk;
2989 newsk->timer.function = &net_timer;
2990 init_timer(&newsk->retransmit_timer);
2991 newsk->retransmit_timer.data = (unsignedlong)newsk;
2992 newsk->retransmit_timer.function=&retransmit_timer;
2993 newsk->dummy_th.source = skb->h.th->dest;
2994 newsk->dummy_th.dest = skb->h.th->source;
2995
2996 /*2997 * Swap these two, they are from our point of view. 2998 */2999
3000 newsk->daddr = saddr;
3001 newsk->saddr = daddr;
3002
3003 put_sock(newsk->num,newsk);
3004 newsk->dummy_th.res1 = 0;
3005 newsk->dummy_th.doff = 6;
3006 newsk->dummy_th.fin = 0;
3007 newsk->dummy_th.syn = 0;
3008 newsk->dummy_th.rst = 0;
3009 newsk->dummy_th.psh = 0;
3010 newsk->dummy_th.ack = 0;
3011 newsk->dummy_th.urg = 0;
3012 newsk->dummy_th.res2 = 0;
3013 newsk->acked_seq = skb->h.th->seq + 1;
3014 newsk->copied_seq = skb->h.th->seq + 1;
3015 newsk->socket = NULL;
3016
3017 /*3018 * Grab the ttl and tos values and use them 3019 */3020
3021 newsk->ip_ttl=sk->ip_ttl;
3022 newsk->ip_tos=skb->ip_hdr->tos;
3023
3024 /*3025 * Use 512 or whatever user asked for 3026 */3027
3028 /*3029 * Note use of sk->user_mss, since user has no direct access to newsk 3030 */3031
3032 rt=ip_rt_route(saddr, NULL,NULL);
3033
3034 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3035 newsk->window_clamp = rt->rt_window;
3036 else3037 newsk->window_clamp = 0;
3038
3039 if (sk->user_mss)
3040 newsk->mtu = sk->user_mss;
3041 elseif(rt!=NULL && (rt->rt_flags&RTF_MSS))
3042 newsk->mtu = rt->rt_mss - sizeof(structiphdr) - sizeof(structtcphdr);
3043 else3044 {3045 #ifdefCONFIG_INET_SNARL/* Sub Nets Are Local */3046 if ((saddr ^ daddr) & default_mask(saddr))
3047 #else3048 if ((saddr ^ daddr) & dev->pa_mask)
3049 #endif3050 newsk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
3051 else3052 newsk->mtu = MAX_WINDOW;
3053 }3054
3055 /*3056 * But not bigger than device MTU 3057 */3058
3059 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
3060
3061 /*3062 * This will min with what arrived in the packet 3063 */3064
3065 tcp_options(newsk,skb->h.th);
3066
3067 tcp_cache_zap();
3068
3069 buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3070 if (buff == NULL)
3071 {3072 sk->err = ENOMEM;
3073 newsk->dead = 1;
3074 newsk->state = TCP_CLOSE;
3075 /* And this will destroy it */3076 release_sock(newsk);
3077 kfree_skb(skb, FREE_READ);
3078 tcp_statistics.TcpAttemptFails++;
3079 return;
3080 }3081
3082 buff->sk = newsk;
3083 buff->localroute = newsk->localroute;
3084
3085 /*3086 * Put in the IP header and routing stuff. 3087 */3088
3089 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3090 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
3091
3092 /*3093 * Something went wrong. 3094 */3095
3096 if (tmp < 0)
3097 {3098 sk->err = tmp;
3099 buff->free = 1;
3100 kfree_skb(buff,FREE_WRITE);
3101 newsk->dead = 1;
3102 newsk->state = TCP_CLOSE;
3103 release_sock(newsk);
3104 skb->sk = sk;
3105 kfree_skb(skb, FREE_READ);
3106 tcp_statistics.TcpAttemptFails++;
3107 return;
3108 }3109
3110 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
3111
3112 memcpy(t1, skb->h.th, sizeof(*t1));
3113 buff->h.seq = newsk->write_seq;
3114 /*3115 * Swap the send and the receive. 3116 */3117 t1->dest = skb->h.th->source;
3118 t1->source = newsk->dummy_th.source;
3119 t1->seq = ntohl(newsk->write_seq++);
3120 t1->ack = 1;
3121 newsk->window = tcp_select_window(newsk);
3122 newsk->sent_seq = newsk->write_seq;
3123 t1->window = ntohs(newsk->window);
3124 t1->res1 = 0;
3125 t1->res2 = 0;
3126 t1->rst = 0;
3127 t1->urg = 0;
3128 t1->psh = 0;
3129 t1->syn = 1;
3130 t1->ack_seq = ntohl(skb->h.th->seq+1);
3131 t1->doff = sizeof(*t1)/4+1;
3132 ptr = skb_put(buff,4);
3133 ptr[0] = 2;
3134 ptr[1] = 4;
3135 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3136 ptr[3] =(newsk->mtu) & 0xff;
3137
3138 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3139 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3140 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3141 skb->sk = newsk;
3142
3143 /*3144 * Charge the sock_buff to newsk. 3145 */3146
3147 sk->rmem_alloc -= skb->truesize;
3148 newsk->rmem_alloc += skb->truesize;
3149
3150 skb_queue_tail(&sk->receive_queue,skb);
3151 sk->ack_backlog++;
3152 release_sock(newsk);
3153 tcp_statistics.TcpOutSegs++;
3154 }3155
3156
3157 staticvoidtcp_close(structsock *sk, inttimeout)
/* */3158 {3159 /*3160 * We need to grab some memory, and put together a FIN, 3161 * and then put it into the queue to be sent.3162 */3163
3164 sk->inuse = 1;
3165
3166 if(th_cache_sk==sk)
3167 tcp_cache_zap();
3168 if(sk->state == TCP_LISTEN)
3169 {3170 /* Special case */3171 tcp_set_state(sk, TCP_CLOSE);
3172 tcp_close_pending(sk);
3173 release_sock(sk);
3174 return;
3175 }3176
3177 sk->keepopen = 1;
3178 sk->shutdown = SHUTDOWN_MASK;
3179
3180 if (!sk->dead)
3181 sk->state_change(sk);
3182
3183 if (timeout == 0)
3184 {3185 structsk_buff *skb;
3186
3187 /*3188 * We need to flush the recv. buffs. We do this only on the3189 * descriptor close, not protocol-sourced closes, because the3190 * reader process may not have drained the data yet!3191 */3192
3193 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
3194 kfree_skb(skb, FREE_READ);
3195 /*3196 * Get rid off any half-completed packets. 3197 */3198
3199 if (sk->partial)
3200 tcp_send_partial(sk);
3201 }3202
3203
3204 /*3205 * Timeout is not the same thing - however the code likes3206 * to send both the same way (sigh).3207 */3208
3209 if(timeout)
3210 {3211 tcp_set_state(sk, TCP_CLOSE); /* Dead */3212 }3213 else3214 {3215 if(tcp_close_state(sk,1)==1)
3216 {3217 tcp_send_fin(sk);
3218 }3219 }3220 release_sock(sk);
3221 }3222
3223
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue.  This happens as incoming acks
 *	open up the remote window for us.
 */
static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here.  In time closedown will
	 *	empty the write queue and all will be happy
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->h.seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data.  We can discard it.  This
			 *	cannot currently occur.
			 */

			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
			/*
			 *	put in the ack seq and window at this point rather than earlier,
			 *	in order to keep them monotonic.  We really want to avoid taking
			 *	back window allocations.  That's legal, but RFC1122 says it's frowned on.
			 *	Ack and window will in general have changed since this packet was put
			 *	on the write queue.
			 */
			iph = skb->ip_hdr;
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			size = skb->len - (((unsigned char *) th) - skb->data);

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			/* Header fields changed, so the checksum must be redone. */
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */

			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */

			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3314
3315 /*3316 * This routine deals with incoming acks, but not outgoing ones.3317 */3318
3319 extern__inline__inttcp_ack(structsock *sk, structtcphdr *th, unsignedlongsaddr, intlen)
/* */3320 {3321 u32ack;
3322 intflag = 0;
3323
3324 /* 3325 * 1 - there was data in packet as well as ack or new data is sent or 3326 * in shutdown state3327 * 2 - data from retransmit queue was acked and removed3328 * 4 - window shrunk or data from retransmit queue was acked and removed3329 */3330
3331 if(sk->zapped)
3332 return(1); /* Dead, cant ack any more so why bother */3333
3334 /*3335 * Have we discovered a larger window3336 */3337
3338 ack = ntohl(th->ack_seq);
3339
3340 if (ntohs(th->window) > sk->max_window)
3341 {3342 sk->max_window = ntohs(th->window);
3343 #ifdefCONFIG_INET_PCTCP3344 /* Hack because we don't send partial packets to non SWS3345 handling hosts */3346 sk->mss = min(sk->max_window>>1, sk->mtu);
3347 #else3348 sk->mss = min(sk->max_window, sk->mtu);
3349 #endif3350 }3351
3352 /*3353 * We have dropped back to keepalive timeouts. Thus we have3354 * no retransmits pending.3355 */3356
3357 if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3358 sk->retransmits = 0;
3359
3360 /*3361 * If the ack is newer than sent or older than previous acks3362 * then we can probably ignore it.3363 */3364
3365 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
3366 {3367 if(sk->debug)
3368 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3369
3370 /*3371 * Keepalive processing.3372 */3373
3374 if (after(ack, sk->sent_seq))
3375 {3376 return(0);
3377 }3378
3379 /*3380 * Restart the keepalive timer.3381 */3382
3383 if (sk->keepopen)
3384 {3385 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3386 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3387 }3388 return(1);
3389 }3390
3391 /*3392 * If there is data set flag 13393 */3394
3395 if (len != th->doff*4)
3396 flag |= 1;
3397
3398 /*3399 * See if our window has been shrunk. 3400 */3401
3402 if (after(sk->window_seq, ack+ntohs(th->window)))
3403 {3404 /*3405 * We may need to move packets from the send queue3406 * to the write queue, if the window has been shrunk on us.3407 * The RFC says you are not allowed to shrink your window3408 * like this, but if the other end does, you must be able3409 * to deal with it.3410 */3411 structsk_buff *skb;
3412 structsk_buff *skb2;
3413 structsk_buff *wskb = NULL;
3414
3415 skb2 = sk->send_head;
3416 sk->send_head = NULL;
3417 sk->send_tail = NULL;
3418
3419 /*3420 * This is an artifact of a flawed concept. We want one3421 * queue and a smarter send routine when we send all.3422 */3423
3424 flag |= 4; /* Window changed */3425
3426 sk->window_seq = ack + ntohs(th->window);
3427 cli();
3428 while (skb2 != NULL)
3429 {3430 skb = skb2;
3431 skb2 = skb->link3;
3432 skb->link3 = NULL;
3433 if (after(skb->h.seq, sk->window_seq))
3434 {3435 if (sk->packets_out > 0)
3436 sk->packets_out--;
3437 /* We may need to remove this from the dev send list. */3438 if (skb->next != NULL)
3439 {3440 skb_unlink(skb);
3441 }3442 /* Now add it to the write_queue. */3443 if (wskb == NULL)
3444 skb_queue_head(&sk->write_queue,skb);
3445 else3446 skb_append(wskb,skb);
3447 wskb = skb;
3448 }3449 else3450 {3451 if (sk->send_head == NULL)
3452 {3453 sk->send_head = skb;
3454 sk->send_tail = skb;
3455 }3456 else3457 {3458 sk->send_tail->link3 = skb;
3459 sk->send_tail = skb;
3460 }3461 skb->link3 = NULL;
3462 }3463 }3464 sti();
3465 }3466
3467 /*3468 * Pipe has emptied3469 */3470
3471 if (sk->send_tail == NULL || sk->send_head == NULL)
3472 {3473 sk->send_head = NULL;
3474 sk->send_tail = NULL;
3475 sk->packets_out= 0;
3476 }3477
3478 /*3479 * Update the right hand window edge of the host3480 */3481
3482 sk->window_seq = ack + ntohs(th->window);
3483
3484 /*3485 * We don't want too many packets out there. 3486 */3487
3488 if (sk->ip_xmit_timeout == TIME_WRITE &&
3489 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
3490 {3491 /* 3492 * This is Jacobson's slow start and congestion avoidance. 3493 * SIGCOMM '88, p. 328. Because we keep cong_window in integral3494 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a 3495 * counter and increment it once every cwnd times. It's possible3496 * that this should be done only if sk->retransmits == 0. I'm3497 * interpreting "new data is acked" as including data that has3498 * been retransmitted but is just now being acked.3499 */3500 if (sk->cong_window < sk->ssthresh)
3501 /* 3502 * In "safe" area, increase3503 */3504 sk->cong_window++;
3505 else3506 {3507 /*3508 * In dangerous area, increase slowly. In theory this is3509 * sk->cong_window += 1 / sk->cong_window3510 */3511 if (sk->cong_count >= sk->cong_window)
3512 {3513 sk->cong_window++;
3514 sk->cong_count = 0;
3515 }3516 else3517 sk->cong_count++;
3518 }3519 }3520
3521 /*3522 * Remember the highest ack received.3523 */3524
3525 sk->rcv_ack_seq = ack;
3526
3527 /*3528 * If this ack opens up a zero window, clear backoff. It was3529 * being used to time the probes, and is probably far higher than3530 * it needs to be for normal retransmission.3531 */3532
3533 if (sk->ip_xmit_timeout == TIME_PROBE0)
3534 {3535 sk->retransmits = 0; /* Our probe was answered */3536
3537 /*3538 * Was it a usable window open ?3539 */3540
3541 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */3542 ! before (sk->window_seq, sk->write_queue.next->h.seq))
3543 {3544 sk->backoff = 0;
3545
3546 /*3547 * Recompute rto from rtt. this eliminates any backoff.3548 */3549
3550 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3551 if (sk->rto > 120*HZ)
3552 sk->rto = 120*HZ;
3553 if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about3554 .2 of a second because of BSD delayed acks - on a 100Mb/sec link3555 .2 of a second is going to need huge windows (SIGH) */3556 sk->rto = 20;
3557 }3558 }3559
3560 /* 3561 * See if we can take anything off of the retransmit queue.3562 */3563
3564 while(sk->send_head != NULL)
3565 {3566 /* Check for a bug. */3567 if (sk->send_head->link3 &&
3568 after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
3569 printk("INET: tcp.c: *** bug send_list out of order.\n");
3570
3571 /*3572 * If our packet is before the ack sequence we can3573 * discard it as it's confirmed to have arrived the other end.3574 */3575
3576 if (before(sk->send_head->h.seq, ack+1))
3577 {3578 structsk_buff *oskb;
3579 if (sk->retransmits)
3580 {3581 /*3582 * We were retransmitting. don't count this in RTT est 3583 */3584 flag |= 2;
3585
3586 /*3587 * even though we've gotten an ack, we're still3588 * retransmitting as long as we're sending from3589 * the retransmit queue. Keeping retransmits non-zero3590 * prevents us from getting new data interspersed with3591 * retransmissions.3592 */3593
3594 if (sk->send_head->link3) /* Any more queued retransmits? */3595 sk->retransmits = 1;
3596 else3597 sk->retransmits = 0;
3598 }3599 /*3600 * Note that we only reset backoff and rto in the3601 * rtt recomputation code. And that doesn't happen3602 * if there were retransmissions in effect. So the3603 * first new packet after the retransmissions is3604 * sent with the backoff still in effect. Not until3605 * we get an ack from a non-retransmitted packet do3606 * we reset the backoff and rto. This allows us to deal3607 * with a situation where the network delay has increased3608 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)3609 */3610
3611 /*3612 * We have one less packet out there. 3613 */3614
3615 if (sk->packets_out > 0)
3616 sk->packets_out --;
3617 /* 3618 * Wake up the process, it can probably write more. 3619 */3620 if (!sk->dead)
3621 sk->write_space(sk);
3622 oskb = sk->send_head;
3623
3624 if (!(flag&2)) /* Not retransmitting */3625 {3626 longm;
3627
3628 /*3629 * The following amusing code comes from Jacobson's3630 * article in SIGCOMM '88. Note that rtt and mdev3631 * are scaled versions of rtt and mean deviation.3632 * This is designed to be as fast as possible 3633 * m stands for "measurement".3634 */3635
3636 m = jiffies - oskb->when; /* RTT */3637 if(m<=0)
3638 m=1; /* IS THIS RIGHT FOR <0 ??? */3639 m -= (sk->rtt >> 3); /* m is now error in rtt est */3640 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */3641 if (m < 0)
3642 m = -m; /* m is now abs(error) */3643 m -= (sk->mdev >> 2); /* similar update on mdev */3644 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */3645
3646 /*3647 * Now update timeout. Note that this removes any backoff.3648 */3649
3650 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3651 if (sk->rto > 120*HZ)
3652 sk->rto = 120*HZ;
3653 if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */3654 sk->rto = 20;
3655 sk->backoff = 0;
3656 }3657 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt 3658 In this case as we just set it up */3659 cli();
3660 oskb = sk->send_head;
3661 IS_SKB(oskb);
3662 sk->send_head = oskb->link3;
3663 if (sk->send_head == NULL)
3664 {3665 sk->send_tail = NULL;
3666 }3667
3668 /*3669 * We may need to remove this from the dev send list. 3670 */3671
3672 if (oskb->next)
3673 skb_unlink(oskb);
3674 sti();
3675 kfree_skb(oskb, FREE_WRITE); /* write. */3676 if (!sk->dead)
3677 sk->write_space(sk);
3678 }3679 else3680 {3681 break;
3682 }3683 }3684
3685 /*3686 * XXX someone ought to look at this too.. at the moment, if skb_peek()3687 * returns non-NULL, we complete ignore the timer stuff in the else3688 * clause. We ought to organize the code so that else clause can3689 * (should) be executed regardless, possibly moving the PROBE timer3690 * reset over. The skb_peek() thing should only move stuff to the3691 * write queue, NOT also manage the timer functions.3692 */3693
3694 /*3695 * Maybe we can take some stuff off of the write queue,3696 * and put it onto the xmit queue.3697 */3698 if (skb_peek(&sk->write_queue) != NULL)
3699 {3700 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3701 (sk->retransmits == 0 ||
3702 sk->ip_xmit_timeout != TIME_WRITE ||
3703 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3704 && sk->packets_out < sk->cong_window)
3705 {3706 /*3707 * Add more data to the send queue.3708 */3709 flag |= 1;
3710 tcp_write_xmit(sk);
3711 }3712 elseif (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3713 sk->send_head == NULL &&
3714 sk->ack_backlog == 0 &&
3715 sk->state != TCP_TIME_WAIT)
3716 {3717 /*3718 * Data to queue but no room.3719 */3720 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3721 }3722 }3723 else3724 {3725 /*3726 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets3727 * from TCP_CLOSE we don't do anything3728 *3729 * from anything else, if there is write data (or fin) pending,3730 * we use a TIME_WRITE timeout, else if keepalive we reset to3731 * a KEEPALIVE timeout, else we delete the timer.3732 *3733 * We do not set flag for nominal write data, otherwise we may3734 * force a state where we start to write itsy bitsy tidbits3735 * of data.3736 */3737
3738 switch(sk->state) {3739 caseTCP_TIME_WAIT:
3740 /*3741 * keep us in TIME_WAIT until we stop getting packets,3742 * reset the timeout.3743 */3744 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3745 break;
3746 caseTCP_CLOSE:
3747 /*3748 * don't touch the timer.3749 */3750 break;
3751 default:
3752 /*3753 * Must check send_head, write_queue, and ack_backlog3754 * to determine which timeout to use.3755 */3756 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {3757 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3758 }elseif (sk->keepopen) {3759 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3760 }else{3761 del_timer(&sk->retransmit_timer);
3762 sk->ip_xmit_timeout = 0;
3763 }3764 break;
3765 }3766 }3767
3768 /*3769 * We have nothing queued but space to send. Send any partial3770 * packets immediately (end of Nagle rule application).3771 */3772
3773 if (sk->packets_out == 0 && sk->partial != NULL &&
3774 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
3775 {3776 flag |= 1;
3777 tcp_send_partial(sk);
3778 }3779
3780 /*3781 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and3782 * we are now waiting for an acknowledge to our FIN. The other end is3783 * already in TIME_WAIT.3784 *3785 * Move to TCP_CLOSE on success.3786 */3787
3788 if (sk->state == TCP_LAST_ACK)
3789 {3790 if (!sk->dead)
3791 sk->state_change(sk);
3792 if(sk->debug)
3793 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3794 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3795 if (sk->rcv_ack_seq == sk->write_seq/*&& sk->acked_seq == sk->fin_seq*/)
3796 {3797 flag |= 1;
3798 tcp_set_state(sk,TCP_CLOSE);
3799 sk->shutdown = SHUTDOWN_MASK;
3800 }3801 }3802
3803 /*3804 * Incoming ACK to a FIN we sent in the case of our initiating the close.3805 *3806 * Move to FIN_WAIT2 to await a FIN from the other end. Set3807 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.3808 */3809
3810 if (sk->state == TCP_FIN_WAIT1)
3811 {3812
3813 if (!sk->dead)
3814 sk->state_change(sk);
3815 if (sk->rcv_ack_seq == sk->write_seq)
3816 {3817 flag |= 1;
3818 sk->shutdown |= SEND_SHUTDOWN;
3819 tcp_set_state(sk, TCP_FIN_WAIT2);
3820 }3821 }3822
3823 /*3824 * Incoming ACK to a FIN we sent in the case of a simultaneous close.3825 *3826 * Move to TIME_WAIT3827 */3828
3829 if (sk->state == TCP_CLOSING)
3830 {3831
3832 if (!sk->dead)
3833 sk->state_change(sk);
3834 if (sk->rcv_ack_seq == sk->write_seq)
3835 {3836 flag |= 1;
3837 tcp_time_wait(sk);
3838 }3839 }3840
3841 /*3842 * Final ack of a three way shake 3843 */3844
3845 if(sk->state==TCP_SYN_RECV)
3846 {3847 tcp_set_state(sk, TCP_ESTABLISHED);
3848 tcp_options(sk,th);
3849 sk->dummy_th.dest=th->source;
3850 sk->copied_seq = sk->acked_seq;
3851 if(!sk->dead)
3852 sk->state_change(sk);
3853 if(sk->max_window==0)
3854 {3855 sk->max_window=32; /* Sanity check */3856 sk->mss=min(sk->max_window,sk->mtu);
3857 }3858 }3859
3860 /*3861 * I make no guarantees about the first clause in the following3862 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under3863 * what conditions "!flag" would be true. However I think the rest3864 * of the conditions would prevent that from causing any3865 * unnecessary retransmission. 3866 * Clearly if the first packet has expired it should be 3867 * retransmitted. The other alternative, "flag&2 && retransmits", is3868 * harder to explain: You have to look carefully at how and when the3869 * timer is set and with what timeout. The most recent transmission always3870 * sets the timer. So in general if the most recent thing has timed3871 * out, everything before it has as well. So we want to go ahead and3872 * retransmit some more. If we didn't explicitly test for this3873 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"3874 * would not be true. If you look at the pattern of timing, you can3875 * show that rto is increased fast enough that the next packet would3876 * almost never be retransmitted immediately. Then you'd end up3877 * waiting for a timeout to send each packet on the retransmission3878 * queue. With my implementation of the Karn sampling algorithm,3879 * the timeout would double each time. The net result is that it would3880 * take a hideous amount of time to recover from a single dropped packet.3881 * It's possible that there should also be a test for TIME_WRITE, but3882 * I think as long as "send_head != NULL" and "retransmit" is on, we've3883 * got to be in real retransmission mode.3884 * Note that tcp_do_retransmit is called with all==1. Setting cong_window3885 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.3886 * As long as no further losses occur, this seems reasonable.3887 */3888
3889 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3890 (((flag&2) && sk->retransmits) ||
3891 (sk->send_head->when + sk->rto < jiffies)))
3892 {3893 if(sk->send_head->when + sk->rto < jiffies)
3894 tcp_retransmit(sk,0);
3895 else3896 {3897 tcp_do_retransmit(sk, 1);
3898 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3899 }3900 }3901
3902 return(1);
3903 }3904
3905
3906 /*3907 * Process the FIN bit. This now behaves as it is supposed to work3908 * and the FIN takes effect when it is validly part of sequence3909 * space. Not before when we get holes.3910 *3911 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT3912 * (and thence onto LAST-ACK and finally, CLOSE, we never enter3913 * TIME-WAIT)3914 *3915 * If we are in FINWAIT-1, a received FIN indicates simultaneous3916 * close and we go into CLOSING (and later onto TIME-WAIT)3917 *3918 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.3919 *3920 */3921
static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	/*
	 *	Record where the FIN sits in sequence space: end of the data,
	 *	plus one sequence number each for a SYN and the FIN itself.
	 */
	sk->fin_seq = th->seq + skb->len + th->syn + th->fin;

	if (!sk->dead)
	{
		/* Wake anyone sleeping on this socket and kick async (SIGIO)
		   waiters - the connection state is about to change. */
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 *	move to CLOSE_WAIT, tcp_data() already handled
			 *	sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			/* A FIN arriving on a frame that also carries RST shuts
			   down both directions at once. */
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 *	received a retransmission of the FIN, do
			 *	nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 *	received a retransmission of the FIN,
			 *	restart the TIME_WAIT timer.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 *	This case occurs when a simultaneous close
			 *	happens, we must ack the received FIN and
			 *	enter the CLOSING state.
			 *
			 *	This causes a WRITE timeout, which will either
			 *	move on to TIME_WAIT when we timeout, or resend
			 *	the FIN properly (maybe we get rid of that annoying
			 *	FIN lost hang). The TIME_WRITE code is already correct
			 *	for handling this timeout.
			 */
			if(sk->ip_xmit_timeout != TIME_WRITE)
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 *	received a FIN -- send ACK and enter TIME_WAIT
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 *	already in CLOSE
			 */
			break;
		default:
			/* Any remaining state: acknowledge the FIN and wait in
			   LAST_ACK for the peer to ack our own FIN. */
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}
4001
4002
4003 /*4004 * This routine handles the data. If there is room in the buffer,4005 * it will be have already been moved into it. If there is no4006 * room, then we will just have to discard the packet.4007 */4008
extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk,
	 unsigned long saddr, unsigned short len)
{
	struct sk_buff *skb1, *skb2;
	struct tcphdr *th;
	int dup_dumped=0;		/* set when this frame replaced a queued duplicate */
	u32 new_seq, shut_seq;

	/* Strip the TCP header so skb->len is the payload length only. */
	th = skb->h.th;
	skb_pull(skb,th->doff*4);
	skb_trim(skb,len-(th->doff*4));

	/*
	 *	The bytes in the receive read/assembly queue has increased. Needed for the
	 *	low memory discard algorithm
	 */

	sk->bytes_rcv += skb->len;

	if (skb->len == 0 && !th->fin)
	{
		/*
		 *	Don't want to keep passing ack's back and forth.
		 *	(someone sent us dataless, boring frame)
		 */
		if (!th->ack)
			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
		kfree_skb(skb, FREE_READ);
		return(0);
	}

	/*
	 *	We no longer have anyone receiving data on this connection.
	 */

#ifndef TCP_DONT_RST_SHUTDOWN

	if(sk->shutdown & RCV_SHUTDOWN)
	{
		/*
		 *	FIXME: BSD has some magic to avoid sending resets to
		 *	broken 4.2 BSD keepalives. Much to my surprise a few non
		 *	BSD stacks still have broken keepalives so we want to
		 *	cope with it.
		 */

		if(skb->len)	/* We don't care if it's just an ack or
			   	   a keepalive/window probe */
		{
			new_seq= th->seq + skb->len + th->syn;	/* Right edge of _data_ part of frame */

			/* Do this the way 4.4BSD treats it. Not what I'd
			   regard as the meaning of the spec but it's what BSD
			   does and clearly they know everything 8) */

			/*
			 *	This is valid because of two things
			 *
			 *	a) The way tcp_data behaves at the bottom.
			 *	b) A fin takes effect when read not when received.
			 */

			shut_seq=sk->acked_seq+1;	/* Last byte */

			if(after(new_seq,shut_seq))
			{
				if(sk->debug)
					printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
						sk, new_seq, shut_seq, sk->blog);
				if(sk->dead)
				{
					/* Nobody will ever read this: RST the sender
					   and tear the connection down (EPIPE). */
					sk->acked_seq = new_seq + th->fin;
					tcp_reset(sk->saddr, sk->daddr, skb->h.th,
						sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
					tcp_statistics.TcpEstabResets++;
					tcp_set_state(sk,TCP_CLOSE);
					sk->err = EPIPE;
					sk->shutdown = SHUTDOWN_MASK;
					kfree_skb(skb, FREE_READ);
					return 0;
				}
			}
		}
	}

#endif

	/*
	 *	Now we have to walk the chain, and figure out where this one
	 *	goes into it. This is set up so that the last packet we received
	 *	will be the first one we look at, that way if everything comes
	 *	in order, there will be no performance loss, and if they come
	 *	out of order we will be able to fit things in nicely.
	 *
	 *	[AC: This is wrong. We should assume in order first and then walk
	 *	forwards from the first hole based upon real traffic patterns.]
	 */

	if (skb_peek(&sk->receive_queue) == NULL)	/* Empty queue is easy case */
	{
		skb_queue_head(&sk->receive_queue,skb);
		skb1= NULL;
	}
	else
	{
		/* Walk backwards from the newest queued frame. */
		for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
		{
			if(sk->debug)
			{
				printk("skb1=%p :", skb1);
				printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
				printk("skb->h.th->seq = %d\n",skb->h.th->seq);
				printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
						sk->acked_seq);
			}

			/*
			 *	Optimisation: Duplicate frame or extension of previous frame from
			 *	same sequence point (lost ack case).
			 *	The frame contains duplicate data or replaces a previous frame
			 *	discard the previous frame (safe as sk->inuse is set) and put
			 *	the new one in its place.
			 */

			if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
			{
				skb_append(skb1,skb);
				skb_unlink(skb1);
				kfree_skb(skb1,FREE_READ);
				dup_dumped=1;
				skb1=NULL;
				break;
			}

			/*
			 *	Found where it fits
			 */

			if (after(th->seq+1, skb1->h.th->seq))
			{
				skb_append(skb1,skb);
				break;
			}

			/*
			 *	See if we've hit the start. If so insert.
			 */
			if (skb1 == skb_peek(&sk->receive_queue))
			{
				skb_queue_head(&sk->receive_queue, skb);
				break;
			}
		}
	}

	/*
	 *	Figure out what the ack value for this frame is
	 */

	th->ack_seq = th->seq + skb->len;
	if (th->syn)
		th->ack_seq++;
	if (th->fin)
		th->ack_seq++;

	/* Invariant check: acked must never fall behind copied. */
	if (before(sk->acked_seq, sk->copied_seq))
	{
		printk("*** tcp.c:tcp_data bug acked < copied\n");
		sk->acked_seq = sk->copied_seq;
	}

	/*
	 *	Now figure out if we can ack anything. This is very messy because we really want two
	 *	receive queues, a completed and an assembly queue. We also want only one transmit
	 *	queue.
	 */

	if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
	{
		if (before(th->seq, sk->acked_seq+1))
		{
			int newwindow;

			if (after(th->ack_seq, sk->acked_seq))
			{
				/* Shrink the advertised window by the data we consumed. */
				newwindow = sk->window-(th->ack_seq - sk->acked_seq);
				if (newwindow < 0)
					newwindow = 0;
				sk->window = newwindow;
				sk->acked_seq = th->ack_seq;
			}
			skb->acked = 1;

			/*
			 *	When we ack the fin, we do the FIN
			 *	processing.
			 */

			if (skb->h.th->fin)
			{
				tcp_fin(skb,sk,skb->h.th);
			}

			/* The new frame may have filled a hole - see how much of
			   the already-queued out-of-order data is now in sequence. */
			for(skb2 = skb->next;
				skb2 != (struct sk_buff *)&sk->receive_queue;
				skb2 = skb2->next)
			{
				if (before(skb2->h.th->seq, sk->acked_seq+1))
				{
					if (after(skb2->h.th->ack_seq, sk->acked_seq))
					{
						newwindow = sk->window -
							(skb2->h.th->ack_seq - sk->acked_seq);
						if (newwindow < 0)
							newwindow = 0;
						sk->window = newwindow;
						sk->acked_seq = skb2->h.th->ack_seq;
					}
					skb2->acked = 1;
					/*
					 *	When we ack the fin, we do
					 *	the fin handling.
					 *
					 *	NOTE(review): this passes skb, not skb2 -
					 *	it looks like it should be
					 *	tcp_fin(skb2,sk,skb2->h.th). Left as-is;
					 *	confirm before changing.
					 */
					if (skb2->h.th->fin)
					{
						tcp_fin(skb,sk,skb->h.th);
					}

					/*
					 *	Force an immediate ack.
					 */

					sk->ack_backlog = sk->max_ack_backlog;
				}
				else
				{
					break;
				}
			}

			/*
			 *	This also takes care of updating the window.
			 *	This if statement needs to be simplified.
			 */
			if (!sk->delay_acks ||
			    sk->ack_backlog >= sk->max_ack_backlog ||
			    sk->bytes_rcv > sk->max_unacked || th->fin) {
	/*			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
			}
			else
			{
				/* Delay the ack: count it and arm the delayed-ack timer. */
				sk->ack_backlog++;
				if(sk->debug)
					printk("Ack queued.\n");
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
		}
	}

	/*
	 *	If we've missed a packet, send an ack.
	 *	Also start a timer to send another.
	 */

	if (!skb->acked)
	{

		/*
		 *	This is important. If we don't have much room left,
		 *	we need to throw out a few packets so we have a good
		 *	window. Note that mtu is used, not mss, because mss is really
		 *	for the send side. He could be sending us stuff as large as mtu.
		 */

		while (sk->prot->rspace(sk) < sk->mtu)
		{
			skb1 = skb_peek(&sk->receive_queue);
			if (skb1 == NULL)
			{
				printk("INET: tcp.c:tcp_data memory leak detected.\n");
				break;
			}

			/*
			 *	Don't throw out something that has been acked.
			 */

			if (skb1->acked)
			{
				break;
			}

			skb_unlink(skb1);
			kfree_skb(skb1, FREE_READ);
		}
		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
		sk->ack_backlog++;
		reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
	}
	else
	{
		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
	}

	/*
	 *	Now tell the user we may have some data.
	 */

	if (!sk->dead)
	{
		if(sk->debug)
			printk("Data wakeup.\n");
		sk->data_ready(sk,0);
	}
	return(0);
}
4327
4328 /*4329 * This routine is only called when we have urgent data4330 * signalled. Its the 'slow' part of tcp_urg. It could be4331 * moved inline now as tcp_urg is only called from one4332 * place. We handle URGent data wrong. We have to - as4333 * BSD still doesn't use the correction from RFC961.4334 */4335
4336 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4337 {4338 u32ptr = ntohs(th->urg_ptr);
4339
4340 if (ptr)
4341 ptr--;
4342 ptr += th->seq;
4343
4344 /* ignore urgent data that we've already seen and read */4345 if (after(sk->copied_seq, ptr))
4346 return;
4347
4348 /* do we already have a newer (or duplicate) urgent pointer? */4349 if (sk->urg_data && !after(ptr, sk->urg_seq))
4350 return;
4351
4352 /* tell the world about our new urgent pointer */4353 if (sk->proc != 0) {4354 if (sk->proc > 0) {4355 kill_proc(sk->proc, SIGURG, 1);
4356 }else{4357 kill_pg(-sk->proc, SIGURG, 1);
4358 }4359 }4360 sk->urg_data = URG_NOTYET;
4361 sk->urg_seq = ptr;
4362 }4363
4364 /*4365 * This is the 'fast' part of urgent handling.4366 */4367
4368 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4369 unsignedlongsaddr, unsignedlonglen)
4370 {4371 u32ptr;
4372
4373 /*4374 * Check if we get a new urgent pointer - normally not 4375 */4376
4377 if (th->urg)
4378 tcp_check_urg(sk,th);
4379
4380 /*4381 * Do we wait for any urgent data? - normally not4382 */4383
4384 if (sk->urg_data != URG_NOTYET)
4385 return 0;
4386
4387 /*4388 * Is the urgent pointer pointing into this packet? 4389 */4390
4391 ptr = sk->urg_seq - th->seq + th->doff*4;
4392 if (ptr >= len)
4393 return 0;
4394
4395 /*4396 * Ok, got the correct packet, update info 4397 */4398
4399 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4400 if (!sk->dead)
4401 sk->data_ready(sk,0);
4402 return 0;
4403 }4404
4405 /*4406 * This will accept the next outstanding connection. 4407 */4408
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 *	We need to make sure that this socket is listening,
	 *	and that it has something pending.
	 */

	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race: disable interrupts before claiming the socket. */
	cli();
	sk->inuse = 1;

	/* Wait until a fully established connection is queued. */
	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Drop the lock while sleeping so the softirq side can queue. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked)
		{
			/* Interrupted by a signal - socket was already released
			   before the sleep, so just bail out. */
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk.
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4463
4464 /*4465 * This will initiate an outgoing connection. 4466 */4467
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
	{
		return(-EISCONN);
	}

	/* Need at least family+port+address of a sockaddr_in. */
	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */

	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 *	Don't want a TCP connection going to a broadcast address
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	/* Initialise our end of the sequence space. */
	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	release_sock(sk);

	/* Allocate the SYN frame; may sleep (GFP_KERNEL), hence the
	   release/re-acquire of the socket around it. */
	buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;


	/*
	 *	Put in the IP header and routing stuff.
	 */

	if (sk->localroute)
		rt=ip_rt_local(sk->daddr, NULL, sk->saddr ? NULL : &sk->saddr);
	else
		rt=ip_rt_route(sk->daddr, NULL, sk->saddr ? NULL : &sk->saddr);

	/*
	 *	We need to build the routing stuff from the things saved in skb.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	/* Build the TCP header for the SYN from the socket's template. */
	t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	t1->seq = ntohl(sk->write_seq++);
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;		/* 24 bytes: header + 4 bytes of MSS option */
	/* use 512 or whatever user asked for */

	/* Per-route window clamp, if the route specifies one. */
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/* Choose the MSS: user setting, then route, then a guess based on
	   whether the destination is on the local subnet. */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
		sk->mtu = rt->rt_mss;
	else
	{
#ifdef CONFIG_INET_SNARL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
		else
			sk->mtu = MAX_WINDOW;
	}
	/*
	 *	but not bigger than device MTU
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

	/*
	 *	Put in the TCP options to say MTU.
	 */

	ptr = skb_put(buff,4);
	ptr[0] = 2;		/* kind: MSS */
	ptr[1] = 4;		/* option length */
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 *	This must go first otherwise a really quick response will get reset.
	 */

	tcp_cache_zap();
	tcp_set_state(sk,TCP_SYN_SENT);
	/* Seed the RTO from the route's initial RTT when available. */
	if(rt&&rt->rt_flags&RTF_IRTT)
		sk->rto = rt->rt_irtt;
	else
		sk->rto = TCP_TIMEOUT_INIT;
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = 0;	/* Now works the right way instead of a hacked initial setting */

	sk->prot->queue_xmit(sk, dev, buff, 0);
	/* NOTE(review): the timer was already armed just above; this second
	   reset looks redundant - confirm before removing. */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4628
4629 /* This functions checks to see if the tcp header is actually acceptable. */4630 extern__inline__inttcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4631 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4632 {4633 u32next_seq;
4634
4635 next_seq = len - 4*th->doff;
4636 if (th->fin)
4637 next_seq++;
4638 /* if we have a zero window, we can't have any data in the packet.. */4639 if (next_seq && !sk->window)
4640 gotoignore_it;
4641 next_seq += th->seq;
4642
4643 /*4644 * This isn't quite right. sk->acked_seq could be more recent4645 * than sk->window. This is however close enough. We will accept4646 * slightly more packets than we should, but it should not cause4647 * problems unless someone is trying to forge packets.4648 */4649
4650 /* have we already seen all of this packet? */4651 if (!after(next_seq+1, sk->acked_seq))
4652 gotoignore_it;
4653 /* or does it start beyond the window? */4654 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4655 gotoignore_it;
4656
4657 /* ok, at least part of this packet would seem interesting.. */4658 return 1;
4659
4660 ignore_it:
4661 if (th->rst)
4662 return 0;
4663
4664 /*4665 * Send a reset if we get something not ours and we are4666 * unsynchronized. Note: We don't do anything to our end. We4667 * are just killing the bogus remote connection then we will4668 * connect again and it will work (with luck).4669 */4670
4671 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4672 {4673 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4674 return 1;
4675 }4676
4677 /* Try to resync things. */4678 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4679 return 0;
4680 }4681
4682 /*4683 * When we get a reset we do this.4684 */4685
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	/* The connection is gone: mark the socket zapped and choose the
	   errno the application should see for the state it was in. */
	sk->zapped = 1;

	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;		/* connect() rejected */
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;		/* peer already closed */
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	/* Wake any sleeper so it can observe the error. */
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
4714 /*4715 * A TCP packet has arrived.4716 * skb->h.raw is the TCP header.4717 */4718
4719 inttcp_rcv(structsk_buff *skb, structdevice *dev, structoptions *opt,
/* */4720 __u32daddr, unsignedshortlen,
4721 __u32saddr, intredo, structinet_protocol * protocol)
4722 {4723 structtcphdr *th;
4724 structsock *sk;
4725 intsyn_ok=0;
4726
4727 tcp_statistics.TcpInSegs++;
4728 if(skb->pkt_type!=PACKET_HOST)
4729 {4730 kfree_skb(skb,FREE_READ);
4731 return(0);
4732 }4733
4734 th = skb->h.th;
4735
4736 /*4737 * Find the socket, using the last hit cache if applicable.4738 */4739
4740 if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
4741 {4742 sk=(structsock *)th_cache_sk;
4743 /*4744 * We think this is causing the bug so4745 */4746 if(sk!=get_sock(&tcp_prot,th->dest, saddr, th->source, daddr))
4747 printk("Cache mismatch on TCP.\n");
4748 }4749 else4750 {4751 sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4752 th_cache_saddr=saddr;
4753 th_cache_daddr=daddr;
4754 th_cache_dport=th->dest;
4755 th_cache_sport=th->source;
4756 th_cache_sk=sk;
4757 }4758
4759 /*4760 * If this socket has got a reset it's to all intents and purposes 4761 * really dead. Count closed sockets as dead.4762 *4763 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD4764 * simply drops data. This seems incorrect as a 'closed' TCP doesn't4765 * exist so should cause resets as if the port was unreachable.4766 */4767
4768 if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4769 sk=NULL;
4770
4771 if (!redo)
4772 {4773 /*4774 * Pull up the IP header.4775 */4776 skb_pull(skb, skb->h.raw-skb->data);
4777 /*4778 * Try to use the device checksum if provided.4779 */4780 if (
4781 (skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
4782 (!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
4783 )
4784 {4785 skb->sk = NULL;
4786 kfree_skb(skb,FREE_READ);
4787 /*4788 * We don't release the socket because it was4789 * never marked in use.4790 */4791 return(0);
4792 }4793 th->seq = ntohl(th->seq);
4794
4795 /* See if we know about the socket. */4796 if (sk == NULL)
4797 {4798 /*4799 * No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)4800 */4801 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4802 skb->sk = NULL;
4803 /*4804 * Discard frame4805 */4806 kfree_skb(skb, FREE_READ);
4807 return(0);
4808 }4809
4810 /* skb->len = len;*/4811 skb->acked = 0;
4812 skb->used = 0;
4813 skb->free = 0;
4814 skb->saddr = daddr;
4815 skb->daddr = saddr;
4816
4817 /* We may need to add it to the backlog here. */4818 cli();
4819 if (sk->inuse)
4820 {4821 skb_queue_tail(&sk->back_log, skb);
4822 sti();
4823 return(0);
4824 }4825 sk->inuse = 1;
4826 sti();
4827 }4828 else4829 {4830 if (sk==NULL)
4831 {4832 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4833 skb->sk = NULL;
4834 kfree_skb(skb, FREE_READ);
4835 return(0);
4836 }4837 }4838
4839
4840 if (!sk->prot)
4841 {4842 printk("IMPOSSIBLE 3\n");
4843 return(0);
4844 }4845
4846
4847 /*4848 * Charge the memory to the socket. 4849 */4850
4851 if (sk->rmem_alloc + skb->truesize >= sk->rcvbuf)
4852 {4853 kfree_skb(skb, FREE_READ);
4854 release_sock(sk);
4855 return(0);
4856 }4857
4858 skb->sk=sk;
4859 sk->rmem_alloc += skb->truesize;
4860
4861 /*4862 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We4863 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug4864 * compatibility. We also set up variables more thoroughly [Karn notes in the4865 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].4866 */4867
4868 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */4869 {4870
4871 /*4872 * Now deal with unusual cases.4873 */4874
4875 if(sk->state==TCP_LISTEN)
4876 {4877 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */4878 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4879
4880 /*4881 * We don't care for RST, and non SYN are absorbed (old segments)4882 * Broadcast/multicast SYN isn't allowed. Note - bug if you change the4883 * netmask on a running connection it can go broadcast. Even Sun's have4884 * this problem so I'm ignoring it 4885 */4886
4887 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4888 {4889 kfree_skb(skb, FREE_READ);
4890 release_sock(sk);
4891 return 0;
4892 }4893
4894 /* 4895 * Guess we need to make a new socket up 4896 */4897
4898 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4899
4900 /*4901 * Now we have several options: In theory there is nothing else4902 * in the frame. KA9Q has an option to send data with the syn,4903 * BSD accepts data with the syn up to the [to be] advertised window4904 * and Solaris 2.1 gives you a protocol error. For now we just ignore4905 * it, that fits the spec precisely and avoids incompatibilities. It4906 * would be nice in future to drop through and process the data.4907 */4908
4909 release_sock(sk);
4910 return 0;
4911 }4912
4913 /* retransmitted SYN? */4914 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4915 {4916 kfree_skb(skb, FREE_READ);
4917 release_sock(sk);
4918 return 0;
4919 }4920
4921 /*4922 * SYN sent means we have to look for a suitable ack and either reset4923 * for bad matches or go to connected 4924 */4925
4926 if(sk->state==TCP_SYN_SENT)
4927 {4928 /* Crossed SYN or previous junk segment */4929 if(th->ack)
4930 {4931 /* We got an ack, but it's not a good ack */4932 if(!tcp_ack(sk,th,saddr,len))
4933 {4934 /* Reset the ack - its an ack from a 4935 different connection [ th->rst is checked in tcp_reset()] */4936 tcp_statistics.TcpAttemptFails++;
4937 tcp_reset(daddr, saddr, th,
4938 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4939 kfree_skb(skb, FREE_READ);
4940 release_sock(sk);
4941 return(0);
4942 }4943 if(th->rst)
4944 returntcp_std_reset(sk,skb);
4945 if(!th->syn)
4946 {4947 /* A valid ack from a different connection4948 start. Shouldn't happen but cover it */4949 kfree_skb(skb, FREE_READ);
4950 release_sock(sk);
4951 return 0;
4952 }4953 /*4954 * Ok.. it's good. Set up sequence numbers and4955 * move to established.4956 */4957 syn_ok=1; /* Don't reset this connection for the syn */4958 sk->acked_seq=th->seq+1;
4959 sk->fin_seq=th->seq;
4960 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4961 tcp_set_state(sk, TCP_ESTABLISHED);
4962 tcp_options(sk,th);
4963 sk->dummy_th.dest=th->source;
4964 sk->copied_seq = sk->acked_seq;
4965 if(!sk->dead)
4966 {4967 sk->state_change(sk);
4968 sock_wake_async(sk->socket, 0);
4969 }4970 if(sk->max_window==0)
4971 {4972 sk->max_window = 32;
4973 sk->mss = min(sk->max_window, sk->mtu);
4974 }4975 }4976 else4977 {4978 /* See if SYN's cross. Drop if boring */4979 if(th->syn && !th->rst)
4980 {4981 /* Crossed SYN's are fine - but talking to4982 yourself is right out... */4983 if(sk->saddr==saddr && sk->daddr==daddr &&
4984 sk->dummy_th.source==th->source &&
4985 sk->dummy_th.dest==th->dest)
4986 {4987 tcp_statistics.TcpAttemptFails++;
4988 returntcp_std_reset(sk,skb);
4989 }4990 tcp_set_state(sk,TCP_SYN_RECV);
4991
4992 /*4993 * FIXME:4994 * Must send SYN|ACK here4995 */4996 }4997 /* Discard junk segment */4998 kfree_skb(skb, FREE_READ);
4999 release_sock(sk);
5000 return 0;
5001 }5002 /*5003 * SYN_RECV with data maybe.. drop through5004 */5005 gotorfc_step6;
5006 }5007
5008 /*5009 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is5010 * a more complex suggestion for fixing these reuse issues in RFC16445011 * but not yet ready for general use. Also see RFC1379.5012 */5013
5014 #defineBSD_TIME_WAIT5015 #ifdefBSD_TIME_WAIT5016 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
5017 after(th->seq, sk->acked_seq) && !th->rst)
5018 {5019 u32seq = sk->write_seq;
5020 if(sk->debug)
5021 printk("Doing a BSD time wait\n");
5022 tcp_statistics.TcpEstabResets++;
5023 sk->rmem_alloc -= skb->truesize;
5024 skb->sk = NULL;
5025 sk->err=ECONNRESET;
5026 tcp_set_state(sk, TCP_CLOSE);
5027 sk->shutdown = SHUTDOWN_MASK;
5028 release_sock(sk);
5029 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
5030 if (sk && sk->state==TCP_LISTEN)
5031 {5032 sk->inuse=1;
5033 skb->sk = sk;
5034 sk->rmem_alloc += skb->truesize;
5035 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
5036 release_sock(sk);
5037 return 0;
5038 }5039 kfree_skb(skb, FREE_READ);
5040 return 0;
5041 }5042 #endif5043 }5044
5045 /*5046 * We are now in normal data flow (see the step list in the RFC)5047 * Note most of these are inline now. I'll inline the lot when5048 * I have time to test it hard and look at what gcc outputs 5049 */5050
5051 if(!tcp_sequence(sk,th,len,opt,saddr,dev))
5052 {5053 kfree_skb(skb, FREE_READ);
5054 release_sock(sk);
5055 return 0;
5056 }5057
5058 if(th->rst)
5059 returntcp_std_reset(sk,skb);
5060
5061 /*5062 * !syn_ok is effectively the state test in RFC793.5063 */5064
5065 if(th->syn && !syn_ok)
5066 {5067 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
5068 returntcp_std_reset(sk,skb);
5069 }5070
5071 /*5072 * Process the ACK5073 */5074
5075
5076 if(th->ack && !tcp_ack(sk,th,saddr,len))
5077 {5078 /*5079 * Our three way handshake failed.5080 */5081
5082 if(sk->state==TCP_SYN_RECV)
5083 {5084 tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
5085 }5086 kfree_skb(skb, FREE_READ);
5087 release_sock(sk);
5088 return 0;
5089 }5090
5091 rfc_step6: /* I'll clean this up later */5092
5093 /*5094 * Process urgent data5095 */5096
5097 if(tcp_urg(sk, th, saddr, len))
5098 {5099 kfree_skb(skb, FREE_READ);
5100 release_sock(sk);
5101 return 0;
5102 }5103
5104
5105 /*5106 * Process the encapsulated data5107 */5108
5109 if(tcp_data(skb,sk, saddr, len))
5110 {5111 kfree_skb(skb, FREE_READ);
5112 release_sock(sk);
5113 return 0;
5114 }5115
5116 /*5117 * And done5118 */5119
5120 release_sock(sk);
5121 return 0;
5122 }5123
/*
 *	This routine sends a packet with an out of date sequence
 *	number. It assumes the other end will try to ack it.
 *
 *	It is used as a window probe: if the peer has opened its window a
 *	little (sent_seq is before window_seq) a copy of the head of the
 *	write queue, trimmed to the window, is transmitted; otherwise a
 *	bare ACK carrying an old sequence number is sent purely to provoke
 *	an ACK (and hence a window update) from the peer.
 */
static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff,*skb;
	struct tcphdr *t1;
	struct device *dev=NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 *	Write data can still be transmitted/retransmitted in the
	 *	following states.  If any other state is encountered, return.
	 *	[listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED &&
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 &&
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	)
	{
		return;
	}

	if ( before(sk->sent_seq, sk->window_seq) &&
	    (skb=skb_peek(&sk->write_queue)))
	{
		/*
		 *	We are probing the opening of a window
		 *	but the window size is != 0
		 *	must have been a result SWS avoidance ( sender )
		 */

		struct iphdr *iph;
		struct tcphdr *th;
		struct tcphdr *nth;
		unsigned long win_size;
#if 0
		unsigned long ow_size;
#endif
		void * tcp_data_start;

		/*
		 *	How many bytes can we send ?
		 *	(Non-zero here: the before() test above guarantees
		 *	window_seq is strictly after sent_seq.)
		 */

		win_size = sk->window_seq - sk->sent_seq;

		/*
		 *	Recover the buffer pointers from the queued frame
		 */

		iph = (struct iphdr *)skb->ip_hdr;
		th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));

		/*
		 *	Grab the data for a temporary frame: room for the
		 *	window's worth of data plus TCP/IP headers and the
		 *	link-layer header.
		 */

		buff = sk->prot->wmalloc(sk, win_size + th->doff * 4 +
				     (iph->ihl << 2) +
				     sk->prot->max_header + 15,
				     1, GFP_ATOMIC);
		if ( buff == NULL )
			return;

		/*
		 *	If we strip the packet on the write queue we must
		 *	be ready to retransmit this one
		 */

		buff->free = /*0*/1;

		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put headers on the new packet
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
					 IPPROTO_TCP, sk->opt, buff->truesize,
					 sk->ip_tos,sk->ip_ttl);
		if (tmp < 0)
		{
			/* Header build failed - give the buffer back */
			sk->prot->wfree(sk, buff);
			return;
		}

		/*
		 *	Move the TCP header over from the queued frame
		 */

		buff->dev = dev;

		nth = (struct tcphdr *) skb_put(buff,th->doff*4);

		memcpy(nth, th, th->doff * 4);

		/*
		 *	Correct the new header.
		 *	(ntohl/ntohs here perform the same byte swap as
		 *	htonl/htons would; the latter would read clearer.)
		 */

		nth->ack = 1;
		nth->ack_seq = ntohl(sk->acked_seq);
		nth->window = ntohs(tcp_select_window(sk));
		nth->check = 0;

		/*
		 *	Find the first data byte of the queued frame.
		 */

		tcp_data_start = skb->data + skb->dev->hard_header_len +
				(iph->ihl << 2) + th->doff * 4;

		/*
		 *	Add it to our new buffer
		 */
		memcpy(skb_put(buff,win_size), tcp_data_start, win_size);

		/*
		 *	Remember our right edge sequence number.
		 */

		buff->h.seq = sk->sent_seq + win_size;
		sk->sent_seq = buff->h.seq;		/* Hack */
#if 0

		/*
		 *	now: shrink the queue head segment
		 */

		th->check = 0;
		ow_size = skb->len - win_size -
			((unsigned long) (tcp_data_start - (void *) skb->data));

		memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
		skb_trim(skb,skb->len-win_size);
		sk->sent_seq += win_size;
		th->seq = htonl(sk->sent_seq);
		if (th->urg)
		{
			unsigned short urg_ptr;

			urg_ptr = ntohs(th->urg_ptr);
			if (urg_ptr <= win_size)
				th->urg = 0;
			else
			{
				urg_ptr -= win_size;
				th->urg_ptr = htons(urg_ptr);
				nth->urg_ptr = htons(win_size);
			}
		}
#else
		/*
		 *	NOTE(review): this clears URG on the probe when the
		 *	urgent pointer falls INSIDE the transmitted window,
		 *	which looks inverted relative to the #if 0 variant
		 *	above (which keeps URG when urg_ptr <= win_size).
		 *	Confirm intended URG semantics before changing.
		 */
		if(th->urg && ntohs(th->urg_ptr) < win_size)
			nth->urg = 0;
#endif

		/*
		 *	Checksum the split buffer
		 */

		tcp_send_check(nth, sk->saddr, sk->daddr,
			   nth->doff * 4 + win_size , sk);
	}
	else
	{
		/*
		 *	Nothing to send inside the window: emit a bare ACK
		 *	with a stale sequence number to elicit a reply.
		 */
		buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
		if (buff == NULL)
			return;

		buff->free = 1;
		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put in the IP header and routing stuff.
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0)
		{
			sk->prot->wfree(sk, buff);
			return;
		}

		t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
		memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

		/*
		 *	Use a previous sequence.
		 *	This should cause the other end to send an ack.
		 */

		t1->seq = htonl(sk->sent_seq-1);
		t1->ack = 1;
		t1->res1= 0;
		t1->res2= 0;
		t1->rst = 0;
		t1->urg = 0;
		t1->psh = 0;
		t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
		t1->syn = 0;
		t1->ack_seq = ntohl(sk->acked_seq);
		t1->window = ntohs(tcp_select_window(sk));
		t1->doff = sizeof(*t1)/4;
		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	}

	/*
	 *	Send it.
	 */

	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
5350 /*5351 * A window probe timeout has occurred.5352 */5353
5354 voidtcp_send_probe0(structsock *sk)
/* */5355 {5356 if (sk->zapped)
5357 return; /* After a valid reset we can send no more */5358
5359 tcp_write_wakeup(sk);
5360
5361 sk->backoff++;
5362 sk->rto = min(sk->rto << 1, 120*HZ);
5363 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5364 sk->retransmits++;
5365 sk->prot->retransmits ++;
5366 }5367
5368 /*5369 * Socket option code for TCP. 5370 */5371
5372 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5373 {5374 intval,err;
5375
5376 if(level!=SOL_TCP)
5377 returnip_setsockopt(sk,level,optname,optval,optlen);
5378
5379 if (optval == NULL)
5380 return(-EINVAL);
5381
5382 err=verify_area(VERIFY_READ, optval, sizeof(int));
5383 if(err)
5384 returnerr;
5385
5386 val = get_user((int *)optval);
5387
5388 switch(optname)
5389 {5390 caseTCP_MAXSEG:
5391 /*5392 * values greater than interface MTU won't take effect. however at5393 * the point when this call is done we typically don't yet know5394 * which interface is going to be used5395 */5396 if(val<1||val>MAX_WINDOW)
5397 return -EINVAL;
5398 sk->user_mss=val;
5399 return 0;
5400 caseTCP_NODELAY:
5401 sk->nonagle=(val==0)?0:1;
5402 return 0;
5403 default:
5404 return(-ENOPROTOOPT);
5405 }5406 }5407
5408 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5409 {5410 intval,err;
5411
5412 if(level!=SOL_TCP)
5413 returnip_getsockopt(sk,level,optname,optval,optlen);
5414
5415 switch(optname)
5416 {5417 caseTCP_MAXSEG:
5418 val=sk->user_mss;
5419 break;
5420 caseTCP_NODELAY:
5421 val=sk->nonagle;
5422 break;
5423 default:
5424 return(-ENOPROTOOPT);
5425 }5426 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5427 if(err)
5428 returnerr;
5429 put_user(sizeof(int),(int *) optlen);
5430
5431 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5432 if(err)
5433 returnerr;
5434 put_user(val,(int *)optval);
5435
5436 return(0);
5437 }5438
5439
/*
 *	Protocol operations vector for TCP, registered with the socket
 *	layer. The initialisers are positional; the struct proto
 *	declaration (in a header not shown here) defines the slot order,
 *	so each entry below is tagged with the symbol it supplies.
 *	NOTE(review): the slot-role comments on the trailing numeric
 *	fields are presumed from convention - confirm against the
 *	struct proto declaration.
 */
struct proto tcp_prot = {
	sock_wmalloc,		/* generic write-buffer allocator */
	sock_rmalloc,		/* generic read-buffer allocator */
	sock_wfree,		/* generic write-buffer free */
	sock_rfree,		/* generic read-buffer free */
	sock_rspace,		/* receive-space accounting */
	sock_wspace,		/* send-space accounting */
	tcp_close,
	tcp_read,
	tcp_write,
	tcp_sendto,
	tcp_recvfrom,
	ip_build_header,	/* TCP uses the generic IP header builder */
	tcp_connect,
	tcp_accept,
	ip_queue_xmit,		/* ...and the generic IP transmit path */
	tcp_retransmit,
	tcp_write_wakeup,
	tcp_read_wakeup,
	tcp_rcv,
	tcp_select,
	tcp_ioctl,
	NULL,			/* no handler for this slot */
	tcp_shutdown,
	tcp_setsockopt,
	tcp_getsockopt,
	128,			/* presumably max_header - TODO confirm */
	0,
	"TCP",			/* protocol name */
	0, 0,			/* numeric slots - see struct proto decl */
	{NULL,}
};