1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while 26 * sk->inuse=1 and was trying to connect 27 * (tcp_err()). 28 * Alan Cox : All icmp error handling was broken 29 * pointers passed where wrong and the 30 * socket was looked up backwards. Nobody 31 * tested any icmp error code obviously. 32 * Alan Cox : tcp_err() now handled properly. It 33 * wakes people on errors. select 34 * behaves and the icmp error race 35 * has gone by moving it into sock.c 36 * Alan Cox : tcp_reset() fixed to work for 37 * everything not just packets for 38 * unknown sockets. 39 * Alan Cox : tcp option processing. 40 * Alan Cox : Reset tweaked (still not 100%) [Had 41 * syn rule wrong] 42 * Herp Rosmanith : More reset fixes 43 * Alan Cox : No longer acks invalid rst frames. 44 * Acking any kind of RST is right out. 45 * Alan Cox : Sets an ignore me flag on an rst 46 * receive otherwise odd bits of prattle 47 * escape still 48 * Alan Cox : Fixed another acking RST frame bug. 49 * Should stop LAN workplace lockups. 
50 * Alan Cox : Some tidyups using the new skb list 51 * facilities 52 * Alan Cox : sk->keepopen now seems to work 53 * Alan Cox : Pulls options out correctly on accepts 54 * Alan Cox : Fixed assorted sk->rqueue->next errors 55 * Alan Cox : PSH doesn't end a TCP read. Switched a 56 * bit to skb ops. 57 * Alan Cox : Tidied tcp_data to avoid a potential 58 * nasty. 59 * Alan Cox : Added some better commenting, as the 60 * tcp is hard to follow 61 * Alan Cox : Removed incorrect check for 20 * psh 62 * Michael O'Reilly : ack < copied bug fix. 63 * Johannes Stille : Misc tcp fixes (not all in yet). 64 * Alan Cox : FIN with no memory -> CRASH 65 * Alan Cox : Added socket option proto entries. 66 * Also added awareness of them to accept. 67 * Alan Cox : Added TCP options (SOL_TCP) 68 * Alan Cox : Switched wakeup calls to callbacks, 69 * so the kernel can layer network 70 * sockets. 71 * Alan Cox : Use ip_tos/ip_ttl settings. 72 * Alan Cox : Handle FIN (more) properly (we hope). 73 * Alan Cox : RST frames sent on unsynchronised 74 * state ack error. 75 * Alan Cox : Put in missing check for SYN bit. 76 * Alan Cox : Added tcp_select_window() aka NET2E 77 * window non shrink trick. 78 * Alan Cox : Added a couple of small NET2E timer 79 * fixes 80 * Charles Hedrick : TCP fixes 81 * Toomas Tamm : TCP window fixes 82 * Alan Cox : Small URG fix to rlogin ^C ack fight 83 * Charles Hedrick : Rewrote most of it to actually work 84 * Linus : Rewrote tcp_read() and URG handling 85 * completely 86 * Gerhard Koerting: Fixed some missing timer handling 87 * Matthew Dillon : Reworked TCP machine states as per RFC 88 * Gerhard Koerting: PC/TCP workarounds 89 * Adam Caldwell : Assorted timer/timing errors 90 * Matthew Dillon : Fixed another RST bug 91 * Alan Cox : Move to kernel side addressing changes. 92 * Alan Cox : Beginning work on TCP fastpathing 93 * (not yet usable) 94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
95 * Alan Cox : TCP fast path debugging 96 * Alan Cox : Window clamping 97 * Michael Riepe : Bug in tcp_check() 98 * Matt Dillon : More TCP improvements and RST bug fixes 99 * Matt Dillon : Yet more small nasties remove from the 100 * TCP code (Be very nice to this man if 101 * tcp finally works 100%) 8) 102 * Alan Cox : BSD accept semantics. 103 * Alan Cox : Reset on closedown bug. 104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 105 * Michael Pall : Handle select() after URG properly in 106 * all cases. 107 * Michael Pall : Undo the last fix in tcp_read_urg() 108 * (multi URG PUSH broke rlogin). 109 * Michael Pall : Fix the multi URG PUSH problem in 110 * tcp_readable(), select() after URG 111 * works now. 112 * Michael Pall : recv(...,MSG_OOB) never blocks in the 113 * BSD api. 114 * Alan Cox : Changed the semantics of sk->socket to 115 * fix a race and a signal problem with 116 * accept() and async I/O. 117 * Alan Cox : Relaxed the rules on tcp_sendto(). 118 * Yury Shevchuk : Really fixed accept() blocking problem. 119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 120 * clients/servers which listen in on 121 * fixed ports. 122 * Alan Cox : Cleaned the above up and shrank it to 123 * a sensible code size. 124 * Alan Cox : Self connect lockup fix. 125 * Alan Cox : No connect to multicast. 126 * Ross Biro : Close unaccepted children on master 127 * socket close. 128 * Alan Cox : Reset tracing code. 129 * Alan Cox : Spurious resets on shutdown. 130 * Alan Cox : Giant 15 minute/60 second timer error 131 * Alan Cox : Small whoops in selecting before an 132 * accept. 133 * Alan Cox : Kept the state trace facility since 134 * it's handy for debugging. 135 * Alan Cox : More reset handler fixes. 
136 * Alan Cox : Started rewriting the code based on 137 * the RFC's for other useful protocol 138 * references see: Comer, KA9Q NOS, and 139 * for a reference on the difference 140 * between specifications and how BSD 141 * works see the 4.4lite source. 142 * A.N.Kuznetsov : Don't time wait on completion of tidy 143 * close. 144 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 145 * Linus Torvalds : Fixed BSD port reuse to work first syn 146 * Alan Cox : Reimplemented timers as per the RFC 147 * and using multiple timers for sanity. 148 * Alan Cox : Small bug fixes, and a lot of new 149 * comments. 150 * Alan Cox : Fixed dual reader crash by locking 151 * the buffers (much like datagram.c) 152 * Alan Cox : Fixed stuck sockets in probe. A probe 153 * now gets fed up of retrying without 154 * (even a no space) answer. 155 * Alan Cox : Extracted closing code better 156 * Alan Cox : Fixed the closing state machine to 157 * resemble the RFC. 158 * Alan Cox : More 'per spec' fixes. 159 * Jorge Cwik : Even faster checksumming. 160 * Alan Cox : tcp_data() doesn't ack illegal PSH 161 * only frames. At least one pc tcp stack 162 * generates them. 163 * Alan Cox : Cache last socket. 164 * Alan Cox : Per route irtt. 165 * Matt Day : Select() match BSD precisely on error 166 * Alan Cox : New buffers 167 * Marc Tamsky : Various sk->prot->retransmits and 168 * sk->retransmits misupdating fixed. 169 * Fixed tcp_write_timeout: stuck close, 170 * and TCP syn retries gets used now. 171 * Mark Yarvis : In tcp_read_wakeup(), don't send an 172 * ack if stat is TCP_CLOSED. 173 * Alan Cox : Look up device on a retransmit - routes may 174 * change. Doesn't yet cope with MSS shrink right 175 * but its a start! 176 * Marc Tamsky : Closing in closing fixes. 177 * Mike Shaver : RFC1122 verifications. 178 * Alan Cox : rcv_saddr errors. 179 * Alan Cox : Block double connect(). 180 * Alan Cox : Small hooks for enSKIP. 181 * Alexey Kuznetsov: Path MTU discovery. 
182 * Alan Cox : Support soft errors. 183 * Alan Cox : Fix MTU discovery pathalogical case 184 * when the remote claims no mtu! 185 * Marc Tamsky : TCP_CLOSE fix. 186 * Colin (G3TNE) : Send a reset on syn ack replies in 187 * window but wrong (fixes NT lpd problems) 188 * Pedro Roque : Better TCP window handling, delayed ack. 189 * Joerg Reuter : No modification of locked buffers in 190 * tcp_do_retransmit() 191 * 192 * To Fix: 193 * Fast path the code. Two things here - fix the window calculation 194 * so it doesn't iterate over the queue, also spot packets with no funny 195 * options arriving in order and process directly. 196 * 197 * Rewrite output state machine to use a single queue. 198 * Speed up input assembly algorithm. 199 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 200 * could do with it working on IPv4 201 * User settable/learned rtt/max window/mtu 202 * 203 * Change the fundamental structure to a single send queue maintained 204 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 205 * active routes too]). Cut the queue off in tcp_retransmit/ 206 * tcp_transmit. 207 * Change the receive queue to assemble as it goes. This lets us 208 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 209 * tcp_data/tcp_read as well as the window shrink crud. 210 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 211 * tcp_queue_skb seem obvious routines to extract. 212 * 213 * This program is free software; you can redistribute it and/or 214 * modify it under the terms of the GNU General Public License 215 * as published by the Free Software Foundation; either version 216 * 2 of the License, or(at your option) any later version. 217 * 218 * Description of States: 219 * 220 * TCP_SYN_SENT sent a connection request, waiting for ack 221 * 222 * TCP_SYN_RECV received a connection request, sent ack, 223 * waiting for final ack in three-way handshake. 
224 * 225 * TCP_ESTABLISHED connection established 226 * 227 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 228 * transmission of remaining buffered data 229 * 230 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 231 * to shutdown 232 * 233 * TCP_CLOSING both sides have shutdown but we still have 234 * data we have to finish sending 235 * 236 * TCP_TIME_WAIT timeout to catch resent junk before entering 237 * closed, can only be entered from FIN_WAIT2 238 * or CLOSING. Required because the other end 239 * may not have gotten our last ACK causing it 240 * to retransmit the data packet (which we ignore) 241 * 242 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 243 * us to finish writing our data and to shutdown 244 * (we have to close() to move on to LAST_ACK) 245 * 246 * TCP_LAST_ACK out side has shutdown after remote has 247 * shutdown. There may still be data in our 248 * buffer that we have to finish sending 249 * 250 * TCP_CLOSE socket is finished 251 */ 252
253 /* 254 * RFC1122 status: 255 * NOTE: I'm not going to be doing comments in the code for this one except 256 * for violations and the like. tcp.c is just too big... If I say something 257 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out 258 * with Alan. -- MS 950903 259 * 260 * Use of PSH (4.2.2.2) 261 * MAY aggregate data sent without the PSH flag. (does) 262 * MAY queue data received without the PSH flag. (does) 263 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't) 264 * MAY implement PSH on send calls. (doesn't, thus:) 265 * MUST NOT buffer data indefinitely (doesn't [1 second]) 266 * MUST set PSH on last segment (does) 267 * MAY pass received PSH to application layer (doesn't) 268 * SHOULD send maximum-sized segment whenever possible. (almost always does) 269 * 270 * Window Size (4.2.2.3, 4.2.2.16) 271 * MUST treat window size as an unsigned number (does) 272 * SHOULD treat window size as a 32-bit number (does not) 273 * MUST NOT shrink window once it is offered (does not normally) 274 * 275 * Urgent Pointer (4.2.2.4) 276 * **MUST point urgent pointer to last byte of urgent data (not right 277 * after). (doesn't, to be like BSD) 278 * MUST inform application layer asynchronously of incoming urgent 279 * data. (does) 280 * MUST provide application with means of determining the amount of 281 * urgent data pending. (does) 282 * **MUST support urgent data sequence of arbitrary length. (doesn't, but 283 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) 284 * [Follows BSD 1 byte of urgent data] 285 * 286 * TCP Options (4.2.2.5) 287 * MUST be able to receive TCP options in any segment. (does) 288 * MUST ignore unsupported options (does) 289 * 290 * Maximum Segment Size Option (4.2.2.6) 291 * MUST implement both sending and receiving MSS. (does) 292 * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send 293 * it always). 
(does, even when MSS == 536, which is legal) 294 * MUST assume MSS == 536 if no MSS received at connection setup (does) 295 * MUST calculate "effective send MSS" correctly: 296 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) 297 * (does - but allows operator override) 298 * 299 * TCP Checksum (4.2.2.7) 300 * MUST generate and check TCP checksum. (does) 301 * 302 * Initial Sequence Number Selection (4.2.2.8) 303 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's 304 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is 305 * necessary for 10Mbps networks - and harder than BSD to spoof!) 306 * 307 * Simultaneous Open Attempts (4.2.2.10) 308 * MUST support simultaneous open attempts (does) 309 * 310 * Recovery from Old Duplicate SYN (4.2.2.11) 311 * MUST keep track of active vs. passive open (does) 312 * 313 * RST segment (4.2.2.12) 314 * SHOULD allow an RST segment to contain data (does, but doesn't do 315 * anything with it, which is standard) 316 * 317 * Closing a Connection (4.2.2.13) 318 * MUST inform application of whether connectin was closed by RST or 319 * normal close. (does) 320 * MAY allow "half-duplex" close (treat connection as closed for the 321 * local app, even before handshake is done). (does) 322 * MUST linger in TIME_WAIT for 2 * MSL (does) 323 * 324 * Retransmission Timeout (4.2.2.15) 325 * MUST implement Jacobson's slow start and congestion avoidance 326 * stuff. (does) 327 * 328 * Probing Zero Windows (4.2.2.17) 329 * MUST support probing of zero windows. (does) 330 * MAY keep offered window closed indefinitely. (does) 331 * MUST allow remote window to stay closed indefinitely. (does) 332 * 333 * Passive Open Calls (4.2.2.18) 334 * MUST NOT let new passive open affect other connections. (doesn't) 335 * MUST support passive opens (LISTENs) concurrently. (does) 336 * 337 * Time to Live (4.2.2.19) 338 * MUST make TCP TTL configurable. 
(does - IP_TTL option) 339 * 340 * Event Processing (4.2.2.20) 341 * SHOULD queue out-of-order segments. (does) 342 * MUST aggregate ACK segments whenever possible. (does but badly) 343 * 344 * Retransmission Timeout Calculation (4.2.3.1) 345 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO 346 * calculation. (does, or at least explains them in the comments 8*b) 347 * SHOULD initialize RTO to 0 and RTT to 3. (does) 348 * 349 * When to Send an ACK Segment (4.2.3.2) 350 * SHOULD implement delayed ACK. (does) 351 * MUST keep ACK delay < 0.5 sec. (does) 352 * 353 * When to Send a Window Update (4.2.3.3) 354 * MUST implement receiver-side SWS. (does) 355 * 356 * When to Send Data (4.2.3.4) 357 * MUST implement sender-side SWS. (does) 358 * SHOULD implement Nagle algorithm. (does) 359 * 360 * TCP Connection Failures (4.2.3.5) 361 * MUST handle excessive retransmissions "properly" (see the RFC). (does) 362 * SHOULD inform application layer of soft errors. (does) 363 * 364 * TCP Keep-Alives (4.2.3.6) 365 * MAY provide keep-alives. (does) 366 * MUST make keep-alives configurable on a per-connection basis. (does) 367 * MUST default to no keep-alives. (does) 368 * **MUST make keep-alive interval configurable. (doesn't) 369 * **MUST make default keep-alive interval > 2 hours. (doesn't) 370 * MUST NOT interpret failure to ACK keep-alive packet as dead 371 * connection. (doesn't) 372 * SHOULD send keep-alive with no data. (does) 373 * 374 * TCP Multihoming (4.2.3.7) 375 * MUST get source address from IP layer before sending first 376 * SYN. (does) 377 * MUST use same local address for all segments of a connection. (does) 378 * 379 * IP Options (4.2.3.8) 380 * MUST ignore unsupported IP options. (does) 381 * MAY support Time Stamp and Record Route. (does) 382 * MUST allow application to specify a source route. (does) 383 * MUST allow receieved Source Route option to set route for all future 384 * segments on this connection. 
(does not (security issues)) 385 * 386 * ICMP messages (4.2.3.9) 387 * MUST act on ICMP errors. (does) 388 * MUST slow transmission upon receipt of a Source Quench. (does) 389 * MUST NOT abort connection upon receipt of soft Destination 390 * Unreachables (0, 1, 5), Time Exceededs and Parameter 391 * Problems. (doesn't) 392 * SHOULD report soft Destination Unreachables etc. to the 393 * application. (does) 394 * SHOULD abort connection upon receipt of hard Destination Unreachable 395 * messages (2, 3, 4). (does) 396 * 397 * Remote Address Validation (4.2.3.10) 398 * MUST reject as an error OPEN for invalid remote IP address. (does) 399 * MUST ignore SYN with invalid source address. (does) 400 * MUST silently discard incoming SYN for broadcast/multicast 401 * address. (does) 402 * 403 * Asynchronous Reports (4.2.4.1) 404 * MUST provide mechanism for reporting soft errors to application 405 * layer. (does) 406 * 407 * Type of Service (4.2.4.2) 408 * MUST allow application layer to set Type of Service. (does IP_TOS) 409 * 410 * (Whew. -- MS 950903) 411 **/ 412
413 #include <linux/types.h>
414 #include <linux/sched.h>
415 #include <linux/mm.h>
416 #include <linux/time.h>
417 #include <linux/string.h>
418 #include <linux/config.h>
419 #include <linux/socket.h>
420 #include <linux/sockios.h>
421 #include <linux/termios.h>
422 #include <linux/in.h>
423 #include <linux/fcntl.h>
424 #include <linux/inet.h>
425 #include <linux/netdevice.h>
426 #include <net/snmp.h>
427 #include <net/ip.h>
428 #include <net/protocol.h>
429 #include <net/icmp.h>
430 #include <net/tcp.h>
431 #include <net/arp.h>
432 #include <linux/skbuff.h>
433 #include <net/sock.h>
434 #include <net/route.h>
435 #include <linux/errno.h>
436 #include <linux/timer.h>
437 #include <asm/system.h>
438 #include <asm/segment.h>
439 #include <linux/mm.h>
440 #include <net/checksum.h>
441
442 /* 443 * The MSL timer is the 'normal' timer. 444 */ 445
446 #definereset_msl_timer(x,y,z) reset_timer(x,y,z)
447
448 #define SEQ_TICK 3
449 unsignedlongseq_offset;
450 structtcp_mibtcp_statistics;
451
452 /* 453 * Cached last hit socket 454 */ 455
456 volatileunsignedlongth_cache_saddr,th_cache_daddr;
457 volatileunsignedshortth_cache_dport, th_cache_sport;
458 volatilestructsock *th_cache_sk;
459
460 voidtcp_cache_zap(void)
/* */ 461 { 462 unsignedlongflags;
463 save_flags(flags);
464 cli();
465 th_cache_saddr=0;
466 th_cache_daddr=0;
467 th_cache_dport=0;
468 th_cache_sport=0;
469 th_cache_sk=NULL;
470 restore_flags(flags);
471 } 472
473 staticvoidtcp_close(structsock *sk, inttimeout);
474 staticvoidtcp_read_wakeup(structsock *sk);
475
476 /* 477 * The less said about this the better, but it works and will do for 1.2 (and 1.4 ;)) 478 */ 479
480 staticstructwait_queue *master_select_wakeup;
481
482 static__inline__intmin(unsignedinta, unsignedintb)
/* */ 483 { 484 if (a < b)
485 return(a);
486 return(b);
487 } 488
489 #undefSTATE_TRACE 490
491 #ifdefSTATE_TRACE 492 staticchar *statename[]={ 493 "Unused","Established","Syn Sent","Syn Recv",
494 "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
495 "Close Wait","Last ACK","Listen","Closing"
496 };
497 #endif 498
499 static__inline__voidtcp_set_state(structsock *sk, intstate)
/* */ 500 { 501 if(sk->state==TCP_ESTABLISHED)
502 tcp_statistics.TcpCurrEstab--;
503 #ifdefSTATE_TRACE 504 if(sk->debug)
505 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
506 #endif 507 /* This is a hack but it doesn't occur often and it's going to 508 be a real to fix nicely */ 509
510 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
511 { 512 wake_up_interruptible(&master_select_wakeup);
513 } 514 sk->state=state;
515 if(state==TCP_ESTABLISHED)
516 tcp_statistics.TcpCurrEstab++;
517 if(sk->state==TCP_CLOSE)
518 tcp_cache_zap();
519 } 520
521 /* 522 * This routine picks a TCP windows for a socket based on 523 * the following constraints 524 * 525 * 1. The window can never be shrunk once it is offered (RFC 793) 526 * 2. We limit memory per socket 527 */ 528
529
530 static__inline__unsignedshorttcp_select_window(structsock *sk)
/* */ 531 { 532 longfree_space = sock_rspace(sk);
533 longwindow = 0;
534
535 if (free_space > 1024)
536 free_space &= ~0x3FF; /* make free space a multiple of 1024 */ 537
538 if(sk->window_clamp)
539 free_space = min(sk->window_clamp, free_space);
540
541 /* 542 * compute the actual window i.e. 543 * old_window - received_bytes_on_that_win 544 */ 545
546 if (sk->mss == 0)
547 sk->mss = sk->mtu;
548
549 window = sk->window - (sk->acked_seq - sk->lastwin_seq);
550
551 if ( window < 0 ) { 552 window = 0;
553 printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n",
554 sk->window, sk->acked_seq, sk->lastwin_seq);
555 } 556
557 /* 558 * RFC 1122: 559 * "the suggested [SWS] avoidance algoritm for the receiver is to keep 560 * RECV.NEXT + RCV.WIN fixed until: 561 * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" 562 * 563 * i.e. don't raise the right edge of the window until you can't raise 564 * it MSS bytes 565 */ 566
567 if ( (free_space - window) >= min(sk->mss, MAX_WINDOW/2) )
568 window += ((free_space - window) / sk->mss) * sk->mss;
569
570 sk->window = window;
571 sk->lastwin_seq = sk->acked_seq;
572
573 returnsk->window;
574 } 575
576 /* 577 * This function returns the amount that we can raise the 578 * usable window. 579 */ 580
581 static__inline__unsignedshorttcp_raise_window(structsock *sk)
/* */ 582 { 583 longfree_space = sock_rspace(sk);
584 longwindow = 0;
585
586 if (free_space > 1024)
587 free_space &= ~0x3FF; /* make free space a multiple of 1024 */ 588
589 if(sk->window_clamp)
590 free_space = min(sk->window_clamp, free_space);
591
592 /* 593 * compute the actual window i.e. 594 * old_window - received_bytes_on_that_win 595 */ 596
597 window = sk->window - (sk->acked_seq - sk->lastwin_seq);
598
599 if (sk->mss == 0)
600 sk->mss = sk->mtu;
601
602 if ( window < 0 ) { 603 window = 0;
604 printk(KERN_DEBUG "TRW: win < 0 w=%d 1=%u 2=%u\n",
605 sk->window, sk->acked_seq, sk->lastwin_seq);
606 } 607
608 if ( (free_space - window) >= min(sk->mss, MAX_WINDOW/2) )
609 return ((free_space - window) / sk->mss) * sk->mss;
610
611 return 0;
612 } 613
614 /* 615 * Find someone to 'accept'. Must be called with 616 * sk->inuse=1 or cli() 617 */ 618
619 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 620 { 621 structsk_buff *p=skb_peek(&s->receive_queue);
622 if(p==NULL)
623 returnNULL;
624 do 625 { 626 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
627 returnp;
628 p=p->next;
629 } 630 while(p!=(structsk_buff *)&s->receive_queue);
631 returnNULL;
632 } 633
634 /* 635 * Remove a completed connection and return it. This is used by 636 * tcp_accept() to get connections from the queue. 637 */ 638
639 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 640 { 641 structsk_buff *skb;
642 unsignedlongflags;
643 save_flags(flags);
644 cli();
645 skb=tcp_find_established(s);
646 if(skb!=NULL)
647 skb_unlink(skb); /* Take it off the queue */ 648 restore_flags(flags);
649 returnskb;
650 } 651
652 /* 653 * This routine closes sockets which have been at least partially 654 * opened, but not yet accepted. Currently it is only called by 655 * tcp_close, and timeout mirrors the value there. 656 */ 657
658 staticvoidtcp_close_pending (structsock *sk)
/* */ 659 { 660 structsk_buff *skb;
661
662 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
663 { 664 skb->sk->dead=1;
665 tcp_close(skb->sk, 0);
666 kfree_skb(skb, FREE_READ);
667 } 668 return;
669 } 670
671 /* 672 * Enter the time wait state. 673 */ 674
675 staticvoidtcp_time_wait(structsock *sk)
/* */ 676 { 677 tcp_set_state(sk,TCP_TIME_WAIT);
678 sk->shutdown = SHUTDOWN_MASK;
679 if (!sk->dead)
680 sk->state_change(sk);
681 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
682 } 683
684 /* 685 * A socket has timed out on its send queue and wants to do a 686 * little retransmitting. Currently this means TCP. 687 */ 688
689 voidtcp_do_retransmit(structsock *sk, intall)
/* */ 690 { 691 structsk_buff * skb;
692 structproto *prot;
693 structdevice *dev;
694 intct=0;
695 structrtable *rt;
696
697 prot = sk->prot;
698 skb = sk->send_head;
699
700 while (skb != NULL)
701 { 702 structtcphdr *th;
703 structiphdr *iph;
704 intsize;
705
706 dev = skb->dev;
707 IS_SKB(skb);
708 skb->when = jiffies;
709
710 /* dl1bke 960201 - @%$$! Hope this cures strange race conditions */ 711 /* with AX.25 mode VC. (esp. DAMA) */ 712 /* if the buffer is locked we should not retransmit */ 713 /* anyway, so we don't need all the fuss to prepare */ 714 /* the buffer in this case. */ 715 /* (the skb_pull() changes skb->data while we may */ 716 /* actually try to send the data. Ough. A side */ 717 /* effect is that we'll send some unnecessary data, */ 718 /* but the alternative is desastrous... */ 719
720 if (skb_device_locked(skb))
721 break;
722
723 /* 724 * Discard the surplus MAC header 725 */ 726
727 skb_pull(skb,((unsignedchar *)skb->ip_hdr)-skb->data);
728
729 /* 730 * In general it's OK just to use the old packet. However we 731 * need to use the current ack and window fields. Urg and 732 * urg_ptr could possibly stand to be updated as well, but we 733 * don't keep the necessary data. That shouldn't be a problem, 734 * if the other end is doing the right thing. Since we're 735 * changing the packet, we have to issue a new IP identifier. 736 */ 737
738 iph = (structiphdr *)skb->data;
739 th = (structtcphdr *)(((char *)iph) + (iph->ihl << 2));
740 size = ntohs(iph->tot_len) - (iph->ihl<<2);
741
742 /* 743 * Note: We ought to check for window limits here but 744 * currently this is done (less efficiently) elsewhere. 745 */ 746
747 /* 748 * Put a MAC header back on (may cause ARPing) 749 */ 750
751 { 752 /* ANK: UGLY, but the bug, that was here, should be fixed. 753 */ 754 structoptions * opt = (structoptions*)skb->proto_priv;
755 rt = ip_check_route(&sk->ip_route_cache, opt->srr?opt->faddr:iph->daddr, skb->localroute);
756 } 757
758 iph->id = htons(ip_id_count++);
759 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 760 if (rt && ntohs(iph->tot_len) > rt->rt_mtu)
761 iph->frag_off &= ~htons(IP_DF);
762 #endif 763 ip_send_check(iph);
764
765 if (rt==NULL) /* Deep poo */ 766 { 767 if(skb->sk)
768 { 769 skb->sk->err_soft=ENETUNREACH;
770 skb->sk->error_report(skb->sk);
771 } 772 } 773 else 774 { 775 dev=rt->rt_dev;
776 skb->raddr=rt->rt_gateway;
777 skb->dev=dev;
778 skb->arp=1;
779 if (rt->rt_hh)
780 { 781 memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len);
782 if (!rt->rt_hh->hh_uptodate)
783 { 784 skb->arp = 0;
785 #ifRT_CACHE_DEBUG >= 2
786 printk("tcp_do_retransmit: hh miss %08x via %08x\n", iph->daddr, rt->rt_gateway);
787 #endif 788 } 789 } 790 elseif (dev->hard_header)
791 { 792 if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
793 skb->arp=0;
794 } 795
796 /* 797 * This is not the right way to handle this. We have to 798 * issue an up to date window and ack report with this 799 * retransmit to keep the odd buggy tcp that relies on 800 * the fact BSD does this happy. 801 * We don't however need to recalculate the entire 802 * checksum, so someone wanting a small problem to play 803 * with might like to implement RFC1141/RFC1624 and speed 804 * this up by avoiding a full checksum. 805 */ 806
807 th->ack_seq = htonl(sk->acked_seq);
808 sk->ack_backlog = 0;
809 sk->bytes_rcv = 0;
810 th->window = ntohs(tcp_select_window(sk));
811 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
812
813 /* 814 * If the interface is (still) up and running, kick it. 815 */ 816
817 if (dev->flags & IFF_UP)
818 { 819 /* 820 * If the packet is still being sent by the device/protocol 821 * below then don't retransmit. This is both needed, and good - 822 * especially with connected mode AX.25 where it stops resends 823 * occurring of an as yet unsent anyway frame! 824 * We still add up the counts as the round trip time wants 825 * adjusting. 826 */ 827 if (sk && !skb_device_locked(skb))
828 { 829 /* Remove it from any existing driver queue first! */ 830 skb_unlink(skb);
831 /* Now queue it */ 832 ip_statistics.IpOutRequests++;
833 dev_queue_xmit(skb, dev, sk->priority);
834 } 835 } 836 } 837
838 /* 839 * Count retransmissions 840 */ 841
842 ct++;
843 sk->prot->retransmits ++;
844 tcp_statistics.TcpRetransSegs++;
845
846
847 /* 848 * Only one retransmit requested. 849 */ 850
851 if (!all)
852 break;
853
854 /* 855 * This should cut it off before we send too many packets. 856 */ 857
858 if (ct >= sk->cong_window)
859 break;
860 skb = skb->link3;
861 } 862 } 863
864 /* 865 * Reset the retransmission timer 866 */ 867
868 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 869 { 870 del_timer(&sk->retransmit_timer);
871 sk->ip_xmit_timeout = why;
872 if((long)when < 0)
873 { 874 when=3;
875 printk("Error: Negative timer in xmit_timer\n");
876 } 877 sk->retransmit_timer.expires=jiffies+when;
878 add_timer(&sk->retransmit_timer);
879 } 880
881 /* 882 * This is the normal code called for timeouts. It does the retransmission 883 * and then does backoff. tcp_do_retransmit is separated out because 884 * tcp_ack needs to send stuff from the retransmit queue without 885 * initiating a backoff. 886 */ 887
888
889 voidtcp_retransmit_time(structsock *sk, intall)
/* */ 890 { 891 tcp_do_retransmit(sk, all);
892
893 /* 894 * Increase the timeout each time we retransmit. Note that 895 * we do not increase the rtt estimate. rto is initialized 896 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 897 * that doubling rto each time is the least we can get away with. 898 * In KA9Q, Karn uses this for the first few times, and then 899 * goes to quadratic. netBSD doubles, but only goes up to *64, 900 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 901 * defined in the protocol as the maximum possible RTT. I guess 902 * we'll have to use something other than TCP to talk to the 903 * University of Mars. 904 * 905 * PAWS allows us longer timeouts and large windows, so once 906 * implemented ftp to mars will work nicely. We will have to fix 907 * the 120 second clamps though! 908 */ 909
910 sk->retransmits++;
911 sk->prot->retransmits++;
912 sk->backoff++;
913 sk->rto = min(sk->rto << 1, 120*HZ);
914 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
915 } 916
917
918 /* 919 * A timer event has trigger a tcp retransmit timeout. The 920 * socket xmit queue is ready and set up to send. Because 921 * the ack receive code keeps the queue straight we do 922 * nothing clever here. 923 */ 924
925 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 926 { 927 if (all)
928 { 929 tcp_retransmit_time(sk, all);
930 return;
931 } 932
933 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 934 /* sk->ssthresh in theory can be zero. I guess that's OK */ 935 sk->cong_count = 0;
936
937 sk->cong_window = 1;
938
939 /* Do the actual retransmit. */ 940 tcp_retransmit_time(sk, all);
941 } 942
943 /* 944 * A write timeout has occurred. Process the after effects. 945 */ 946
947 staticinttcp_write_timeout(structsock *sk)
/* */ 948 { 949 /* 950 * Look for a 'soft' timeout. 951 */ 952 if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
953 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
954 { 955 /* 956 * Attempt to recover if arp has changed (unlikely!) or 957 * a route has shifted (not supported prior to 1.3). 958 */ 959 ip_rt_advice(&sk->ip_route_cache, 0);
960 } 961
962 /* 963 * Have we tried to SYN too many times (repent repent 8)) 964 */ 965
966 if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
967 { 968 if(sk->err_soft)
969 sk->err=sk->err_soft;
970 else 971 sk->err=ETIMEDOUT;
972 sk->error_report(sk);
973 del_timer(&sk->retransmit_timer);
974 tcp_statistics.TcpAttemptFails++; /* Is this right ??? - FIXME - */ 975 tcp_set_state(sk,TCP_CLOSE);
976 /* Don't FIN, we got nothing back */ 977 release_sock(sk);
978 return 0;
979 } 980 /* 981 * Has it gone just too far ? 982 */ 983 if (sk->retransmits > TCP_RETR2)
984 { 985 if(sk->err_soft)
986 sk->err = sk->err_soft;
987 else 988 sk->err = ETIMEDOUT;
989 sk->error_report(sk);
990 del_timer(&sk->retransmit_timer);
991 /* 992 * Time wait the socket 993 */ 994 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
995 { 996 tcp_set_state(sk,TCP_TIME_WAIT);
997 reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
998 } 999 else1000 {1001 /*1002 * Clean up time.1003 */1004 tcp_set_state(sk, TCP_CLOSE);
1005 release_sock(sk);
1006 return 0;
1007 }1008 }1009 return 1;
1010 }1011
/*
 *	The TCP retransmit timer. This lacks a few small details.
 *
 *	1.	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 *
 *	'data' is the struct sock pointer cast to unsigned long (set when
 *	the timer was armed). The reason for the timer firing is read from
 *	sk->ip_xmit_timeout.
 */

static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;

	/*
	 *	We are reset. We will send no more retransmits.
	 */

	if (sk->zapped)
		return;

	/*
	 *	Only process if socket is not in use: if the socket is locked
	 *	or we are inside the bottom half, back off and retry in one
	 *	second rather than block.
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	/* Take the socket lock ourselves before re-enabling interrupts. */
	sk->inuse = 1;
	sti();

	/* Flush any pending ack backlog to the reader first. */
	if (sk->ack_backlog && !sk->dead)
		sk->data_ready(sk,0);

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing: send a zero window probe and account the timeout. */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				/* Nothing unacked: this was just a delayed-ack kick. */
				if (sk->ack_backlog)
					tcp_read_wakeup(sk);
				restore_flags(flags);
			}
			else
			{
				/*
				 * Kicked by a delayed ack. Reset timer
				 * correctly now: the head of the queue has not
				 * actually timed out yet, so re-arm for the
				 * remaining time.
				 */
				if (jiffies < skb->when + sk->rto)
				{
					if (sk->ack_backlog)
						tcp_read_wakeup(sk);
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission: count it, resend, and
				 *	check whether we have now failed hard.
				 */
				sk->retransmits++;
				sk->prot->retransmits++;
				sk->prot->retransmit (sk, 0);
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
1129 /*1130 * This routine is called by the ICMP module when it gets some1131 * sort of error condition. If err < 0 then the socket should1132 * be closed and the error returned to the user. If err > 01133 * it's just the icmp type << 8 | icmp code. After adjustment1134 * header points to the first 8 bytes of the tcp header. We need1135 * to find the appropriate port.1136 */1137
1138 voidtcp_err(inttype, intcode, unsignedchar *header, __u32daddr,
/* */1139 __u32saddr, structinet_protocol *protocol)
1140 {1141 structtcphdr *th = (structtcphdr *)header;
1142 structsock *sk;
1143
1144 /*1145 * This one is _WRONG_. FIXME urgently.1146 */1147 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1148 structiphdr *iph=(structiphdr *)(header-sizeof(structiphdr));
1149 #endif1150 th =(structtcphdr *)header;
1151 sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
1152
1153 if (sk == NULL)
1154 return;
1155
1156 if (type == ICMP_SOURCE_QUENCH)
1157 {1158 /*1159 * FIXME:1160 * For now we will just trigger a linear backoff.1161 * The slow start code should cause a real backoff here.1162 */1163 if (sk->cong_window > 4)
1164 sk->cong_window--;
1165 return;
1166 }1167
1168 if (type == ICMP_PARAMETERPROB)
1169 {1170 sk->err=EPROTO;
1171 sk->error_report(sk);
1172 }1173
1174 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1175 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
1176 {1177 structrtable * rt;
1178 /*1179 * Ugly trick to pass MTU to protocol layer.1180 * Really we should add argument "info" to error handler.1181 */1182 unsignedshortnew_mtu = ntohs(iph->id);
1183
1184 if ((rt = sk->ip_route_cache) != NULL)
1185 if (rt->rt_mtu > new_mtu)
1186 rt->rt_mtu = new_mtu;
1187
1188 if (sk->mtu > new_mtu - sizeof(structiphdr) - sizeof(structtcphdr)
1189 && new_mtu > sizeof(structiphdr)+sizeof(structtcphdr))
1190 sk->mtu = new_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
1191
1192 return;
1193 }1194 #endif1195
1196 /*1197 * If we've already connected we will keep trying1198 * until we time out, or the user gives up.1199 */1200
1201 if (code < 13)
1202 {1203 if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1204 {1205 sk->err = icmp_err_convert[code].errno;
1206 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1207 {1208 tcp_statistics.TcpAttemptFails++;
1209 tcp_set_state(sk,TCP_CLOSE);
1210 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */1211 }1212 }1213 else/* Only an error on timeout */1214 sk->err_soft = icmp_err_convert[code].errno;
1215 }1216 }1217
1218
/*
 *	Walk down the receive queue counting readable data until we hit the end or we find a gap
 *	in the received data queue (ie a frame missing that needs sending to us). Not
 *	sorting using two queues as data arrives makes life so much harder.
 *
 *	Returns the number of bytes a read() could currently deliver
 *	(0 if the queue is empty, the socket is NULL, or only out-of-order
 *	data is queued). Runs with interrupts disabled while scanning.
 */

static int tcp_readable(struct sock *sk)
{
	unsigned long counted;
	unsigned long amount;
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 *	Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->seq))		/* Found a hole so stops here */
			break;
		sum = skb->len - (counted - skb->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;		/* SYN occupies one sequence number but no data byte */
		if (sum > 0)
		{					/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;	/* ...but don't report the SYN as readable data */
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop. This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;	/* PSH with data pending: report what we have */
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
1296 /*1297 * LISTEN is a special case for select..1298 */1299 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */1300 {1301 if (sel_type == SEL_IN) {1302 intretval;
1303
1304 sk->inuse = 1;
1305 retval = (tcp_find_established(sk) != NULL);
1306 release_sock(sk);
1307 if (!retval)
1308 select_wait(&master_select_wakeup,wait);
1309 returnretval;
1310 }1311 return 0;
1312 }1313
1314
1315 /*1316 * Wait for a TCP event.1317 *1318 * Note that we don't need to set "sk->inuse", as the upper select layers1319 * take care of normal races (between the test and the event) and we don't1320 * go look at any of the socket buffers directly.1321 */1322 staticinttcp_select(structsock *sk, intsel_type, select_table *wait)
/* */1323 {1324 if (sk->state == TCP_LISTEN)
1325 returntcp_listen_select(sk, sel_type, wait);
1326
1327 switch(sel_type) {1328 caseSEL_IN:
1329 if (sk->err)
1330 return 1;
1331 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1332 break;
1333
1334 if (sk->shutdown & RCV_SHUTDOWN)
1335 return 1;
1336
1337 if (sk->acked_seq == sk->copied_seq)
1338 break;
1339
1340 if (sk->urg_seq != sk->copied_seq ||
1341 sk->acked_seq != sk->copied_seq+1 ||
1342 sk->urginline || !sk->urg_data)
1343 return 1;
1344 break;
1345
1346 caseSEL_OUT:
1347 if (sk->err)
1348 return 1;
1349 if (sk->shutdown & SEND_SHUTDOWN)
1350 return 0;
1351 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1352 break;
1353 /*1354 * This is now right thanks to a small fix1355 * by Matt Dillon.1356 */1357
1358 if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
1359 break;
1360 return 1;
1361
1362 caseSEL_EX:
1363 if (sk->urg_data)
1364 return 1;
1365 break;
1366 }1367 select_wait(sk->sleep, wait);
1368 return 0;
1369 }1370
1371 inttcp_ioctl(structsock *sk, intcmd, unsignedlongarg)
/* */1372 {1373 interr;
1374 switch(cmd)
1375 {1376
1377 caseTIOCINQ:
1378 #ifdef FIXME /* FIXME: */1379 caseFIONREAD:
1380 #endif1381 {1382 unsignedlongamount;
1383
1384 if (sk->state == TCP_LISTEN)
1385 return(-EINVAL);
1386
1387 sk->inuse = 1;
1388 amount = tcp_readable(sk);
1389 release_sock(sk);
1390 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1391 if(err)
1392 returnerr;
1393 put_user(amount, (int *)arg);
1394 return(0);
1395 }1396 caseSIOCATMARK:
1397 {1398 intansw = sk->urg_data && sk->urg_seq == sk->copied_seq;
1399
1400 err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1401 if (err)
1402 returnerr;
1403 put_user(answ,(int *) arg);
1404 return(0);
1405 }1406 caseTIOCOUTQ:
1407 {1408 unsignedlongamount;
1409
1410 if (sk->state == TCP_LISTEN) return(-EINVAL);
1411 amount = sock_wspace(sk);
1412 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1413 if(err)
1414 returnerr;
1415 put_user(amount, (int *)arg);
1416 return(0);
1417 }1418 default:
1419 return(-EINVAL);
1420 }1421 }1422
1423
1424 /*1425 * This routine computes a TCP checksum. 1426 *1427 * Modified January 1995 from a go-faster DOS routine by1428 * Jorge Cwik <jorge@laser.satlink.net>1429 */1430
1431 unsignedshorttcp_check(structtcphdr *th, intlen,
/* */1432 unsignedlongsaddr, unsignedlongdaddr, unsignedlongbase)
1433 {1434 returncsum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
1435 }1436
1437 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1438 unsignedlongdaddr, intlen, structsock *sk)
1439 {1440 th->check = 0;
1441 th->check = tcp_check(th, len, saddr, daddr,
1442 csum_partial((char *)th,len,0));
1443 return;
1444 }1445
/*
 *	This is the main buffer sending routine. We queue the buffer
 *	having checked it is sane seeming. A frame is either transmitted
 *	immediately or placed on the socket's write queue for later,
 *	depending on window, retransmit state and packets in flight.
 */

static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 *	length of packet (not counting length of pre-tcp headers)
	 */

	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 *	Sanity check it..
	 */

	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 *	If we have queued a header size packet.. (these crash a few
	 *	tcp stacks if ack is not set)
	 */

	if (size == sizeof(struct tcphdr))
	{
		/* If it's got a syn or fin it's notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 *	Actual processing: record the sequence span this frame covers
	 *	(data bytes only, header excluded).
	 */

	tcp_statistics.TcpOutSegs++;
	skb->seq = ntohl(th->seq);
	skb->end_seq = skb->seq + size - 4*th->doff;

	/*
	 *	We must queue if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */

	if (after(skb->end_seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit. So
		 * we shouldn't need to set it at all. I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 *	If we don't fit we have to start the zero window
		 *	probes. This is broken - we really need to do a partial
		 *	send _first_ (This is what causes the Cisco and PC/TCP
		 *	grief).
		 */

		if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 *	This is going straight out: piggyback the current
		 *	ack and window, checksum, and hand to the IP layer.
		 */

		th->ack_seq = htonl(sk->acked_seq);
		th->window = htons(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 *	This is mad. The tcp retransmit queue is put together
		 *	by the ip layer. This causes half the problems with
		 *	unroutable FIN's and other things.
		 */

		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/* The piggybacked ack clears any ack debt. */
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;

		/*
		 *	Set for next retransmit based on expected ACK time.
		 *	FIXME: We set this every time which means our
		 *	retransmits are really about a window behind.
		 */

		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
1566 /*1567 * Locking problems lead us to a messy situation where we can have1568 * multiple partially complete buffers queued up. This is really bad1569 * as we don't want to be sending partial buffers. Fix this with1570 * a semaphore or similar to lock tcp_write per socket.1571 *1572 * These routines are pretty self descriptive.1573 */1574
1575 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1576 {1577 structsk_buff * skb;
1578 unsignedlongflags;
1579
1580 save_flags(flags);
1581 cli();
1582 skb = sk->partial;
1583 if (skb) {1584 sk->partial = NULL;
1585 del_timer(&sk->partial_timer);
1586 }1587 restore_flags(flags);
1588 returnskb;
1589 }1590
1591 /*1592 * Empty the partial queue1593 */1594
1595 staticvoidtcp_send_partial(structsock *sk)
/* */1596 {1597 structsk_buff *skb;
1598
1599 if (sk == NULL)
1600 return;
1601 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1602 tcp_send_skb(sk, skb);
1603 }1604
1605 /*1606 * Queue a partial frame1607 */1608
1609 voidtcp_enqueue_partial(structsk_buff * skb, structsock * sk)
/* */1610 {1611 structsk_buff * tmp;
1612 unsignedlongflags;
1613
1614 save_flags(flags);
1615 cli();
1616 tmp = sk->partial;
1617 if (tmp)
1618 del_timer(&sk->partial_timer);
1619 sk->partial = skb;
1620 init_timer(&sk->partial_timer);
1621 /*1622 * Wait up to 1 second for the buffer to fill.1623 */1624 sk->partial_timer.expires = jiffies+HZ;
1625 sk->partial_timer.function = (void (*)(unsignedlong)) tcp_send_partial;
1626 sk->partial_timer.data = (unsignedlong) sk;
1627 add_timer(&sk->partial_timer);
1628 restore_flags(flags);
1629 if (tmp)
1630 tcp_send_skb(sk, tmp);
1631 }1632
1633
1634
1635 /*1636 * This routine sends an ack and also updates the window. 1637 */1638
1639 staticvoidtcp_send_ack(u32sequence, u32ack,
/* */1640 structsock *sk,
1641 structtcphdr *th, unsignedlongdaddr)
1642 {1643 structsk_buff *buff;
1644 structtcphdr *t1;
1645 structdevice *dev = NULL;
1646 inttmp;
1647
1648 if(sk->zapped)
1649 return; /* We have been reset, we may not send again */1650
1651 /*1652 * We need to grab some memory, and put together an ack,1653 * and then put it into the queue to be sent.1654 */1655
1656 buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1657 if (buff == NULL)
1658 {1659 /* 1660 * Force it to send an ack. We don't have to do this1661 * (ACK is unreliable) but it's much better use of 1662 * bandwidth on slow links to send a spare ack than1663 * resend packets. 1664 */1665
1666 sk->ack_backlog++;
1667 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
1668 {1669 reset_xmit_timer(sk, TIME_WRITE, HZ);
1670 }1671 return;
1672 }1673
1674 /*1675 * Assemble a suitable TCP frame1676 */1677
1678 buff->sk = sk;
1679 buff->localroute = sk->localroute;
1680
1681 /* 1682 * Put in the IP header and routing stuff. 1683 */1684
1685 tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1686 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1687 if (tmp < 0)
1688 {1689 buff->free = 1;
1690 sock_wfree(sk, buff);
1691 return;
1692 }1693 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
1694
1695 memcpy(t1, th, sizeof(*t1));
1696
1697 /*1698 * Swap the send and the receive. 1699 */1700
1701 t1->dest = th->source;
1702 t1->source = th->dest;
1703 t1->seq = ntohl(sequence);
1704 t1->ack = 1;
1705 sk->window = tcp_select_window(sk);
1706 t1->window = ntohs(sk->window);
1707 t1->res1 = 0;
1708 t1->res2 = 0;
1709 t1->rst = 0;
1710 t1->urg = 0;
1711 t1->syn = 0;
1712 t1->psh = 0;
1713 t1->fin = 0;
1714
1715 /*1716 * If we have nothing queued for transmit and the transmit timer1717 * is on we are just doing an ACK timeout and need to switch1718 * to a keepalive.1719 */1720
1721 if (ack == sk->acked_seq) {1722 sk->ack_backlog = 0;
1723 sk->bytes_rcv = 0;
1724 sk->ack_timed = 0;
1725
1726 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL1727 && sk->ip_xmit_timeout == TIME_WRITE)
1728 if(sk->keepopen)
1729 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1730 else1731 delete_timer(sk);
1732 }1733
1734 /*1735 * Fill in the packet and send it1736 */1737
1738 t1->ack_seq = htonl(ack);
1739 t1->doff = sizeof(*t1)/4;
1740 tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1741 if (sk->debug)
1742 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
1743 tcp_statistics.TcpOutSegs++;
1744 sk->prot->queue_xmit(sk, dev, buff, 1);
1745 }1746
1747
1748 /* 1749 * This routine builds a generic TCP header. 1750 */1751
1752 extern__inlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */1753 {1754
1755 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1756 th->seq = htonl(sk->write_seq);
1757 th->psh =(push == 0) ? 1 : 0;
1758 th->doff = sizeof(*th)/4;
1759 th->ack = 1;
1760 th->fin = 0;
1761 sk->ack_backlog = 0;
1762 sk->bytes_rcv = 0;
1763 sk->ack_timed = 0;
1764 th->ack_seq = htonl(sk->acked_seq);
1765 sk->window = tcp_select_window(sk);
1766 th->window = htons(sk->window);
1767
1768 return(sizeof(*th));
1769 }1770
/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	Walks the iovec, waiting for the connection to establish and for
 *	buffer memory as required. Returns the number of bytes queued,
 *	or a negative errno if nothing was copied before the failure.
 */

static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags)
{
	int copied = 0;		/* bytes accepted so far; once non-zero we return it instead of an error */
	int copy;		/* bytes to place in the current segment */
	int tmp;
	int seglen;
	int iovct=0;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;	/* non-NULL when the skb was sized for a partial frame */
	struct proto *prot;
	struct device *dev = NULL;
	unsigned char *from;

	/*
	 *	Do sanity checking for sendmsg/sendto/send
	 */

	if (flags & ~(MSG_OOB|MSG_DONTROUTE))
		return -EINVAL;
	if (msg->msg_name)
	{
		/* An address may be supplied but must match the connected peer. */
		struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
		if(sk->state == TCP_CLOSE)
			return -ENOTCONN;
		if (msg->msg_namelen < sizeof(*addr))
			return -EINVAL;
		if (addr->sin_family && addr->sin_family != AF_INET)
			return -EINVAL;
		if (addr->sin_port != sk->dummy_th.dest)
			return -EISCONN;
		if (addr->sin_addr.s_addr != sk->daddr)
			return -EISCONN;
	}

	/*
	 *	Ok commence sending: one pass per iovec element.
	 */

	while(iovct<msg->msg_iovlen)
	{
		seglen=msg->msg_iov[iovct].iov_len;
		from=msg->msg_iov[iovct++].iov_base;
		sk->inuse=1;
		prot = sk->prot;
		while(seglen > 0)
		{
			if (sk->err)
			{			/* Stop on an error */
				release_sock(sk);
				if (copied)
					return(copied);
				return sock_error(sk);
			}

			/*
			 *	First thing we do is make sure that we are established.
			 */

			if (sk->shutdown & SEND_SHUTDOWN)
			{
				release_sock(sk);
				sk->err = EPIPE;
				if (copied)
					return(copied);
				sk->err = 0;
				return(-EPIPE);
			}

			/*
			 *	Wait for a connection to finish.
			 */

			while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
			{
				if (sk->err)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return sock_error(sk);
				}

				/* Any state other than the two SYN states means the connection is dead. */
				if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
				{
					release_sock(sk);
					if (copied)
						return(copied);

					if (sk->err)
						return sock_error(sk);

					if (sk->keepopen)
					{
						send_sig(SIGPIPE, current, 0);
					}
					return(-EPIPE);
				}

				if (nonblock || copied)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return(-EAGAIN);
				}

				release_sock(sk);
				cli();

				/* Re-test with interrupts off before sleeping (avoids a lost wakeup). */
				if (sk->state != TCP_ESTABLISHED &&
				    sk->state != TCP_CLOSE_WAIT && sk->err == 0)
				{
					interruptible_sleep_on(sk->sleep);
					if (current->signal & ~current->blocked)
					{
						sti();
						if (copied)
							return(copied);
						return(-ERESTARTSYS);
					}
				}
				sk->inuse = 1;
				sti();
			}

			/*
			 *	The following code can result in copy <= if sk->mss is ever
			 *	decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
			 *	sk->mtu is constant once SYN processing is finished.  I.e. we
			 *	had better not get here until we've seen his SYN and at least one
			 *	valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
			 *	But ESTABLISHED should guarantee that.  sk->max_window is by definition
			 *	non-decreasing.  Note that any ioctl to set user_mss must be done
			 *	before the exchange of SYN's.  If the initial ack from the other
			 *	end has a window of 0, max_window and thus mss will both be 0.
			 */

			/*
			 *	Now we need to check if we have a half built packet.
			 */
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
			/*
			 *	FIXME: I'm almost sure that this fragment is BUG,
			 *	but it works... I do not know why 8) --ANK
			 *
			 *	Really, we should rebuild all the queues...
			 *	It's difficult. Temprorary hack is to send all
			 *	queued segments with allowed fragmentation.
			 */
			{
				int new_mss = min(sk->mtu, sk->max_window);
				if (new_mss < sk->mss)
				{
					tcp_send_partial(sk);
					sk->mss = new_mss;
				}
			}
#endif

			if ((skb = tcp_dequeue_partial(sk)) != NULL)
			{
				int hdrlen;

				/* IP header + TCP header */
				hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
					+ sizeof(struct tcphdr);

				/* Add more stuff to the end of skb->len */
				if (!(flags & MSG_OOB))
				{
					copy = min(sk->mss - (skb->len - hdrlen), seglen);
					if (copy <= 0)
					{
						printk("TCP: **bug**: \"copy\" <= 0\n");
						return -EFAULT;
					}
					memcpy_fromfs(skb_put(skb,copy), from, copy);
					from += copy;
					copied += copy;
					len -= copy;
					sk->write_seq += copy;
					seglen -= copy;
				}
				/* Full segment, OOB, or idle link: send now; otherwise keep it partial. */
				if ((skb->len - hdrlen) >= sk->mss ||
				    (flags & MSG_OOB) || !sk->packets_out)
					tcp_send_skb(sk, skb);
				else
					tcp_enqueue_partial(skb, sk);
				continue;
			}

			/*
			 *	We also need to worry about the window.
			 *	If window < 1/2 the maximum window we've seen from this
			 *	host, don't use it.  This is sender side
			 *	silly window prevention, as specified in RFC1122.
			 *	(Note that this is different than earlier versions of
			 *	SWS prevention, e.g. RFC813.).  What we actually do is
			 *	use the whole MSS.  Since the results in the right
			 *	edge of the packet being outside the window, it will
			 *	be queued for later rather than sent.
			 */

			copy = sk->window_seq - sk->write_seq;
			if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
				copy = sk->mss;
			if (copy > seglen)
				copy = seglen;

			/*
			 *	We should really check the window here also.
			 */

			send_tmp = NULL;
			if (copy < sk->mss && !(flags & MSG_OOB))
			{
				/*
				 *	We will release the socket in case we sleep here.
				 */
				release_sock(sk);
				/*
				 *	NB: following must be mtu, because mss can be increased.
				 *	mss is always <= mtu
				 */
				skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
				sk->inuse = 1;
				send_tmp = skb;
			}
			else
			{
				/*
				 *	We will release the socket in case we sleep here.
				 */
				release_sock(sk);
				skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
				sk->inuse = 1;
			}

			/*
			 *	If we didn't get any memory, we need to sleep.
			 */

			if (skb == NULL)
			{
				sk->socket->flags |= SO_NOSPACE;
				if (nonblock)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return(-EAGAIN);
				}

				/*
				 *	FIXME: here is another race condition.
				 */

				tmp = sk->wmem_alloc;
				release_sock(sk);
				cli();
				/*
				 *	Again we will try to avoid it: only sleep if no
				 *	memory was freed while we released the socket.
				 */
				if (tmp <= sk->wmem_alloc &&
				  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				  && sk->err == 0)
				{
					sk->socket->flags &= ~SO_NOSPACE;
					interruptible_sleep_on(sk->sleep);
					if (current->signal & ~current->blocked)
					{
						sti();
						if (copied)
							return(copied);
						return(-ERESTARTSYS);
					}
				}
				sk->inuse = 1;
				sti();
				continue;
			}

			skb->sk = sk;
			skb->free = 0;
			skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

			/*
			 *	FIXME: we need to optimize this.
			 *	Perhaps some hints here would be good.
			 */

			tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
			if (tmp < 0 )
			{
				sock_wfree(sk, skb);
				release_sock(sk);
				if (copied)
					return(copied);
				return(tmp);
			}
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
			/* Path MTU discovery: mark the datagram Don't Fragment. */
			skb->ip_hdr->frag_off |= htons(IP_DF);
#endif
			skb->dev = dev;
			skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
			tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
			if (tmp < 0)
			{
				sock_wfree(sk, skb);
				release_sock(sk);
				if (copied)
					return(copied);
				return(tmp);
			}

			if (flags & MSG_OOB)
			{
				skb->h.th->urg = 1;
				skb->h.th->urg_ptr = ntohs(copy);
			}

			memcpy_fromfs(skb_put(skb,copy), from, copy);

			from += copy;
			copied += copy;
			len -= copy;
			seglen -= copy;
			skb->free = 0;
			sk->write_seq += copy;

			/* Undersized frame with data in flight: hold it as a partial. */
			if (send_tmp != NULL && sk->packets_out)
			{
				tcp_enqueue_partial(send_tmp, sk);
				continue;
			}
			tcp_send_skb(sk, skb);
		}
	}
	sk->err = 0;

	/*
	 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
	 *	interactive fast network servers. It's meant to be on and
	 *	it really improves the throughput though not the echo time
	 *	on my slow slip link - Alan
	 */

	/*
	 *	Avoid possible race on send_tmp - c/o Johannes Stille
	 */

	if(sk->partial && ((!sk->packets_out)
	/* If not nagling we can send on the before case too.. */
	  || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
2139 /*2140 * Send an ack if one is backlogged at this point. Ought to merge2141 * this with tcp_send_ack().2142 * This is called for delayed acks also.2143 */2144
2145 staticvoidtcp_read_wakeup(structsock *sk)
/* */2146 {2147 inttmp;
2148 structdevice *dev = NULL;
2149 structtcphdr *t1;
2150 structsk_buff *buff;
2151
2152 if (!sk->ack_backlog)
2153 return;
2154
2155 /*2156 * If we're closed, don't send an ack, or we'll get a RST2157 * from the closed destination.2158 */2159 if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
2160 return;
2161
2162 /*2163 * FIXME: we need to put code here to prevent this routine from2164 * being called. Being called once in a while is ok, so only check2165 * if this is the second time in a row.2166 */2167
2168 /*2169 * We need to grab some memory, and put together an ack,2170 * and then put it into the queue to be sent.2171 */2172
2173 buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
2174 if (buff == NULL)
2175 {2176 /* Try again real soon. */2177 reset_xmit_timer(sk, TIME_WRITE, HZ);
2178 return;
2179 }2180
2181 buff->sk = sk;
2182 buff->localroute = sk->localroute;
2183
2184 /*2185 * Put in the IP header and routing stuff. 2186 */2187
2188 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
2189 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2190 if (tmp < 0)
2191 {2192 buff->free = 1;
2193 sock_wfree(sk, buff);
2194 return;
2195 }2196
2197 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
2198
2199 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
2200 t1->seq = htonl(sk->sent_seq);
2201 t1->ack = 1;
2202 t1->res1 = 0;
2203 t1->res2 = 0;
2204 t1->rst = 0;
2205 t1->urg = 0;
2206 t1->syn = 0;
2207 t1->psh = 0;
2208
2209
2210 sk->ack_backlog = 0;
2211 sk->bytes_rcv = 0;
2212
2213 sk->window = tcp_select_window(sk);
2214 t1->window = htons(sk->window);
2215 t1->ack_seq = htonl(sk->acked_seq);
2216 t1->doff = sizeof(*t1)/4;
2217 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2218 sk->prot->queue_xmit(sk, dev, buff, 1);
2219 tcp_statistics.TcpOutSegs++;
2220 }2221
2222
/*
 *	FIXME:
 *	This routine frees used buffers.
 *	It should consider sending an ACK to let the
 *	other end know we now have a bigger window.
 *
 *	Frees every fully-consumed skb at the head of the receive queue,
 *	then, if receive space changed, either acks immediately or arms
 *	a short delayed-ack timer so the peer learns about the new window.
 */

static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	/* Remember the receive space before freeing, to detect a change below. */
	left = sock_rspace(sk);

	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can. Stop at the first
	 *	skb that is still unread or in use.
	 */

	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */

	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sock_rspace(sk),
			left);
	if ((rspace=sock_rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
		/*
		 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
		 * if the other end is offering a window smaller than the agreed on MSS
		 * (called sk->mtu here).  In theory there's no connection between send
		 * and receive, and so no reason to think that they're going to send
		 * small packets.  For the moment I'm using the hack of reducing the mss
		 * only on the send side, so I'm putting mtu here.
		 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon, but don't shorten an
			 * already-pending earlier expiry. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);
		}
	}
}
2312
2313 /*2314 * Handle reading urgent data. BSD has very simple semantics for2315 * this, no blocking and very strange errors 8)2316 */2317
2318 staticinttcp_recv_urg(structsock * sk, intnonblock,
/* */2319 structmsghdr *msg, intlen, intflags, int *addr_len)
2320 {2321 /*2322 * No URG data to read2323 */2324 if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
2325 return -EINVAL; /* Yes this is right ! */2326
2327 if (sk->err)
2328 returnsock_error(sk);
2329
2330 if (sk->state == TCP_CLOSE || sk->done)
2331 {2332 if (!sk->done)
2333 {2334 sk->done = 1;
2335 return 0;
2336 }2337 return -ENOTCONN;
2338 }2339
2340 if (sk->shutdown & RCV_SHUTDOWN)
2341 {2342 sk->done = 1;
2343 return 0;
2344 }2345 sk->inuse = 1;
2346 if (sk->urg_data & URG_VALID)
2347 {2348 charc = sk->urg_data;
2349 if (!(flags & MSG_PEEK))
2350 sk->urg_data = URG_READ;
2351 memcpy_toiovec(msg->msg_iov, &c, 1);
2352 if(msg->msg_name)
2353 {2354 structsockaddr_in *sin=(structsockaddr_in *)msg->msg_name;
2355 sin->sin_family=AF_INET;
2356 sin->sin_addr.s_addr=sk->daddr;
2357 sin->sin_port=sk->dummy_th.dest;
2358 }2359 if(addr_len)
2360 *addr_len=sizeof(structsockaddr_in);
2361 release_sock(sk);
2362 return 1;
2363 }2364 release_sock(sk);
2365
2366 /*2367 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and2368 * the available implementations agree in this case:2369 * this call should never block, independent of the2370 * blocking state of the socket.2371 * Mike <pall@rz.uni-karlsruhe.de>2372 */2373 return -EAGAIN;
2374 }2375
2376
2377 /*2378 * This routine copies from a sock struct into the user buffer. 2379 */2380
/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	`seq` normally points at sk->copied_seq; for MSG_PEEK it points at
 *	a local copy instead, so peeking never advances the official read
 *	pointer.  It is volatile because multiple readers can race here
 *	and the copy to user space may sleep (see found_ok_skb below).
 *
 *	Returns the number of bytes copied, or a negative errno.
 */
static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags, int *addr_len)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */
	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */
	if (flags & MSG_OOB)
		return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 */
	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		u32 offset;

		/*
		 * Are we at urgent data? Stop if we have read anything.
		 */
		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.  Set the task state before the
		 *	scan so a wakeup between scan and schedule() is not
		 *	lost.
		 */
		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* Gap before this skb: nothing readable yet. */
			if (before(*seq, skb->seq))
				break;
			offset = *seq - skb->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies a sequence number but no data */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* fully consumed: cleanup_rbuf may free it */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* Something was read already: return it rather than sleep. */
		if (copied)
			break;

		if (sk->err)
		{
			copied = sock_error(sk);
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;	/* first EOF report is 0 bytes */
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Nothing to read: ack freed space, then sleep for data. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */
		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */
		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?  If so, clip the read
		 *	short of it (or step over the urgent byte when it is
		 *	first and not inline).
		 */
		if (sk->urg_data)
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					if (!sk->urginline)
					{
						++*seq;	/* skip the urgent byte */
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */
		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */
		memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */
		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;	/* urgent byte is behind us now */
		if (used + offset < skb->len)
			continue;	/* more data left in this skb */

		/*
		 *	Process the FIN.
		 */
		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;	/* drained: eligible for freeing */
		continue;

	found_fin_ok:
		++*seq;	/* FIN consumes one sequence number */
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */
		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}

	/* Fill in the peer's address if the caller asked for it. */
	if(copied>0 && msg->msg_name)
	{
		struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
		sin->sin_family=AF_INET;
		sin->sin_addr.s_addr=sk->daddr;
		sin->sin_port=sk->dummy_th.dest;
	}
	if(addr_len)
		*addr_len=sizeof(struct sockaddr_in);

	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2617
2618
2619 /*2620 * State processing on a close. This implements the state shift for2621 * sending our FIN frame. Note that we only send a FIN for some 2622 * states. A shutdown() may have already sent the FIN, or we may be2623 * closed.2624 */2625
2626 staticinttcp_close_state(structsock *sk, intdead)
/* */2627 {2628 intns=TCP_CLOSE;
2629 intsend_fin=0;
2630 switch(sk->state)
2631 {2632 caseTCP_SYN_SENT: /* No SYN back, no FIN needed */2633 break;
2634 caseTCP_SYN_RECV:
2635 caseTCP_ESTABLISHED: /* Closedown begin */2636 ns=TCP_FIN_WAIT1;
2637 send_fin=1;
2638 break;
2639 caseTCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */2640 caseTCP_FIN_WAIT2:
2641 caseTCP_CLOSING:
2642 ns=sk->state;
2643 break;
2644 caseTCP_CLOSE:
2645 caseTCP_LISTEN:
2646 break;
2647 caseTCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and2648 wait only for the ACK */2649 ns=TCP_LAST_ACK;
2650 send_fin=1;
2651 }2652
2653 tcp_set_state(sk,ns);
2654
2655 /*2656 * This is a (useful) BSD violating of the RFC. There is a2657 * problem with TCP as specified in that the other end could2658 * keep a socket open forever with no application left this end.2659 * We use a 3 minute timeout (about the same as BSD) then kill2660 * our end. If they send after that then tough - BUT: long enough2661 * that we won't make the old 4*rto = almost no time - whoops2662 * reset mistake.2663 */2664 if(dead && ns==TCP_FIN_WAIT2)
2665 {2666 inttimer_active=del_timer(&sk->timer);
2667 if(timer_active)
2668 add_timer(&sk->timer);
2669 else2670 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2671 }2672
2673 returnsend_fin;
2674 }2675
2676 /*2677 * Send a fin.2678 */2679
/*
 *	Send a fin.
 *
 *	Builds the FIN segment for our side of the close.  If data is
 *	still queued for transmission the FIN is appended to the write
 *	queue (so it goes out in order); otherwise it is transmitted at
 *	once and the retransmit timer is armed.  The socket lock is
 *	dropped around the allocation so GFP_KERNEL may sleep, then
 *	re-taken via sk->inuse.
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */
	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost.
		 *	(Not good).  The sequence number is still consumed so
		 *	the state machine stays consistent, and the MSL timer
		 *	is armed unless some other timer is already pending.
		 */
		buff->free = 1;
		sock_wfree(sk,buff);
		sk->write_seq++;
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}

	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */
	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	buff->seq = sk->write_seq;
	sk->write_seq++;	/* the FIN consumes one sequence number */
	buff->end_seq = sk->write_seq;
	t1->seq = htonl(buff->seq);
	t1->ack = 1;
	t1->ack_seq = htonl(sk->acked_seq);
	t1->window = htons(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 * If there is data in the write queue, the fin must be appended to
	 * the write queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		buff->free = 0;
		if (buff->next != NULL)
		{
			/* Should never happen: buffer is already on a list. */
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	}
	else
	{
		/* Queue empty: transmit the FIN right away. */
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2777 /*2778 * Shutdown the sending side of a connection. Much like close except2779 * that we don't receive shut down or set sk->dead=1.2780 */2781
2782 voidtcp_shutdown(structsock *sk, inthow)
/* */2783 {2784 /*2785 * We need to grab some memory, and put together a FIN,2786 * and then put it into the queue to be sent.2787 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2788 */2789
2790 if (!(how & SEND_SHUTDOWN))
2791 return;
2792
2793 /*2794 * If we've already sent a FIN, or it's a closed state2795 */2796
2797 if (sk->state == TCP_FIN_WAIT1 ||
2798 sk->state == TCP_FIN_WAIT2 ||
2799 sk->state == TCP_CLOSING ||
2800 sk->state == TCP_LAST_ACK ||
2801 sk->state == TCP_TIME_WAIT ||
2802 sk->state == TCP_CLOSE ||
2803 sk->state == TCP_LISTEN2804 )
2805 {2806 return;
2807 }2808 sk->inuse = 1;
2809
2810 /*2811 * flag that the sender has shutdown2812 */2813
2814 sk->shutdown |= SEND_SHUTDOWN;
2815
2816 /*2817 * Clear out any half completed packets. 2818 */2819
2820 if (sk->partial)
2821 tcp_send_partial(sk);
2822
2823 /*2824 * FIN if needed2825 */2826
2827 if(tcp_close_state(sk,0))
2828 tcp_send_fin(sk);
2829
2830 release_sock(sk);
2831 }2832
2833 /*2834 * This routine will send an RST to the other tcp. 2835 */2836
2837 staticvoidtcp_reset(unsignedlongsaddr, unsignedlongdaddr, structtcphdr *th,
/* */2838 structproto *prot, structoptions *opt, structdevice *dev, inttos, intttl)
2839 {2840 structsk_buff *buff;
2841 structtcphdr *t1;
2842 inttmp;
2843 structdevice *ndev=NULL;
2844
2845 /*2846 * Cannot reset a reset (Think about it).2847 */2848
2849 if(th->rst)
2850 return;
2851
2852 /*2853 * We need to grab some memory, and put together an RST,2854 * and then put it into the queue to be sent.2855 */2856
2857 buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2858 if (buff == NULL)
2859 return;
2860
2861 buff->sk = NULL;
2862 buff->dev = dev;
2863 buff->localroute = 0;
2864
2865 /*2866 * Put in the IP header and routing stuff. 2867 */2868
2869 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2870 sizeof(structtcphdr),tos,ttl,NULL);
2871 if (tmp < 0)
2872 {2873 buff->free = 1;
2874 sock_wfree(NULL, buff);
2875 return;
2876 }2877
2878 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
2879 memcpy(t1, th, sizeof(*t1));
2880
2881 /*2882 * Swap the send and the receive. 2883 */2884
2885 t1->dest = th->source;
2886 t1->source = th->dest;
2887 t1->rst = 1;
2888 t1->window = 0;
2889
2890 if(th->ack)
2891 {2892 t1->ack = 0;
2893 t1->seq = th->ack_seq;
2894 t1->ack_seq = 0;
2895 }2896 else2897 {2898 t1->ack = 1;
2899 if(!th->syn)
2900 t1->ack_seq = th->seq;
2901 else2902 t1->ack_seq = htonl(ntohl(th->seq)+1);
2903 t1->seq = 0;
2904 }2905
2906 t1->syn = 0;
2907 t1->urg = 0;
2908 t1->fin = 0;
2909 t1->psh = 0;
2910 t1->doff = sizeof(*t1)/4;
2911 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2912 prot->queue_xmit(NULL, ndev, buff, 1);
2913 tcp_statistics.TcpOutSegs++;
2914 }2915
2916
2917 /*2918 * Look for tcp options. Parses everything but only knows about MSS.2919 * This routine is always called with the packet containing the SYN.2920 * However it may also be called with the ack to the SYN. So you2921 * can't assume this is always the SYN. It's always called after2922 * we have set up sk->mtu to our own MTU.2923 *2924 * We need at minimum to add PAWS support here. Possibly large windows2925 * as Linux gets deployed on 100Mb/sec networks.2926 */2927
2928 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2929 {2930 unsignedchar *ptr;
2931 intlength=(th->doff*4)-sizeof(structtcphdr);
2932 intmss_seen = 0;
2933
2934 ptr = (unsignedchar *)(th + 1);
2935
2936 while(length>0)
2937 {2938 intopcode=*ptr++;
2939 intopsize=*ptr++;
2940 switch(opcode)
2941 {2942 caseTCPOPT_EOL:
2943 return;
2944 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */2945 length--;
2946 ptr--; /* the opsize=*ptr++ above was a mistake */2947 continue;
2948
2949 default:
2950 if(opsize<=2) /* Avoid silly options looping forever */2951 return;
2952 switch(opcode)
2953 {2954 caseTCPOPT_MSS:
2955 if(opsize==4 && th->syn)
2956 {2957 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2958 mss_seen = 1;
2959 }2960 break;
2961 /* Add other options here as people feel the urge to implement stuff like large windows */2962 }2963 ptr+=opsize-2;
2964 length-=opsize;
2965 }2966 }2967 if (th->syn)
2968 {2969 if (! mss_seen)
2970 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2971 }2972 #ifdefCONFIG_INET_PCTCP2973 sk->mss = min(sk->max_window >> 1, sk->mtu);
2974 #else2975 sk->mss = min(sk->max_window, sk->mtu);
2976 sk->max_unacked = 2 * sk->mss;
2977 #endif2978 }2979
/*
 *	Classful netmask for an address given in network byte order:
 *	class A -> /8, class B -> /16, everything else -> /24.
 *	The mask is returned in network byte order too.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);
	unsigned long mask;

	if (IN_CLASSA(host))
		mask = IN_CLASSA_NET;
	else if (IN_CLASSB(host))
		mask = IN_CLASSB_NET;
	else
		mask = IN_CLASSC_NET;

	return htonl(mask);
}
2990 /*2991 * Default sequence number picking algorithm.2992 * As close as possible to RFC 793, which2993 * suggests using a 250kHz clock.2994 * Further reading shows this assumes 2MB/s networks.2995 * For 10MB/s ethernet, a 1MHz clock is appropriate.2996 * That's funny, Linux has one built in! Use it!2997 */2998
2999 externinlineu32tcp_init_seq(void)
/* */3000 {3001 structtimevaltv;
3002 do_gettimeofday(&tv);
3003 returntv.tv_usec+tv.tv_sec*1000000;
3004 }3005
3006 /*3007 * This routine handles a connection request.3008 * It should make sure we haven't already responded.3009 * Because of the way BSD works, we have to send a syn/ack now.3010 * This also means it will be harder to close a socket which is3011 * listening.3012 */3013
3014 staticvoidtcp_conn_request(structsock *sk, structsk_buff *skb,
/* */3015 unsignedlongdaddr, unsignedlongsaddr,
3016 structoptions *opt, structdevice *dev, u32seq)
3017 {3018 structsk_buff *buff;
3019 structtcphdr *t1;
3020 unsignedchar *ptr;
3021 structsock *newsk;
3022 structtcphdr *th;
3023 structdevice *ndev=NULL;
3024 inttmp;
3025 structrtable *rt;
3026
3027 th = skb->h.th;
3028
3029 /* If the socket is dead, don't accept the connection. */3030 if (!sk->dead)
3031 {3032 sk->data_ready(sk,0);
3033 }3034 else3035 {3036 if(sk->debug)
3037 printk("Reset on %p: Connect on dead socket.\n",sk);
3038 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
3039 tcp_statistics.TcpAttemptFails++;
3040 kfree_skb(skb, FREE_READ);
3041 return;
3042 }3043
3044 /*3045 * Make sure we can accept more. This will prevent a3046 * flurry of syns from eating up all our memory.3047 */3048
3049 if (sk->ack_backlog >= sk->max_ack_backlog)
3050 {3051 tcp_statistics.TcpAttemptFails++;
3052 kfree_skb(skb, FREE_READ);
3053 return;
3054 }3055
3056 /*3057 * We need to build a new sock struct.3058 * It is sort of bad to have a socket without an inode attached3059 * to it, but the wake_up's will just wake up the listening socket,3060 * and if the listening socket is destroyed before this is taken3061 * off of the queue, this will take care of it.3062 */3063
3064 newsk = (structsock *) kmalloc(sizeof(structsock), GFP_ATOMIC);
3065 if (newsk == NULL)
3066 {3067 /* just ignore the syn. It will get retransmitted. */3068 tcp_statistics.TcpAttemptFails++;
3069 kfree_skb(skb, FREE_READ);
3070 return;
3071 }3072
3073 memcpy(newsk, sk, sizeof(*newsk));
3074 newsk->opt = NULL;
3075 newsk->ip_route_cache = NULL;
3076 if (opt && opt->optlen) {3077 sk->opt = (structoptions*)kmalloc(sizeof(structoptions)+opt->optlen, GFP_ATOMIC);
3078 if (!sk->opt) {3079 kfree_s(newsk, sizeof(structsock));
3080 tcp_statistics.TcpAttemptFails++;
3081 kfree_skb(skb, FREE_READ);
3082 return;
3083 }3084 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {3085 kfree_s(sk->opt, sizeof(structoptions)+opt->optlen);
3086 kfree_s(newsk, sizeof(structsock));
3087 tcp_statistics.TcpAttemptFails++;
3088 kfree_skb(skb, FREE_READ);
3089 return;
3090 }3091 }3092 skb_queue_head_init(&newsk->write_queue);
3093 skb_queue_head_init(&newsk->receive_queue);
3094 newsk->send_head = NULL;
3095 newsk->send_tail = NULL;
3096 skb_queue_head_init(&newsk->back_log);
3097 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/3098 newsk->rto = TCP_TIMEOUT_INIT;
3099 newsk->mdev = 0;
3100 newsk->max_window = 0;
3101 newsk->cong_window = 1;
3102 newsk->cong_count = 0;
3103 newsk->ssthresh = 0;
3104 newsk->backoff = 0;
3105 newsk->blog = 0;
3106 newsk->intr = 0;
3107 newsk->proc = 0;
3108 newsk->done = 0;
3109 newsk->partial = NULL;
3110 newsk->pair = NULL;
3111 newsk->wmem_alloc = 0;
3112 newsk->rmem_alloc = 0;
3113 newsk->localroute = sk->localroute;
3114
3115 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
3116
3117 newsk->err = 0;
3118 newsk->shutdown = 0;
3119 newsk->ack_backlog = 0;
3120 newsk->acked_seq = skb->seq+1;
3121 newsk->lastwin_seq = skb->seq+1;
3122 newsk->delay_acks = 1;
3123 newsk->copied_seq = skb->seq+1;
3124 newsk->fin_seq = skb->seq;
3125 newsk->state = TCP_SYN_RECV;
3126 newsk->timeout = 0;
3127 newsk->ip_xmit_timeout = 0;
3128 newsk->write_seq = seq;
3129 newsk->window_seq = newsk->write_seq;
3130 newsk->rcv_ack_seq = newsk->write_seq;
3131 newsk->urg_data = 0;
3132 newsk->retransmits = 0;
3133 newsk->linger=0;
3134 newsk->destroy = 0;
3135 init_timer(&newsk->timer);
3136 newsk->timer.data = (unsignedlong)newsk;
3137 newsk->timer.function = &net_timer;
3138 init_timer(&newsk->retransmit_timer);
3139 newsk->retransmit_timer.data = (unsignedlong)newsk;
3140 newsk->retransmit_timer.function=&retransmit_timer;
3141 newsk->dummy_th.source = skb->h.th->dest;
3142 newsk->dummy_th.dest = skb->h.th->source;
3143
3144 /*3145 * Swap these two, they are from our point of view. 3146 */3147
3148 newsk->daddr = saddr;
3149 newsk->saddr = daddr;
3150 newsk->rcv_saddr = daddr;
3151
3152 put_sock(newsk->num,newsk);
3153 newsk->dummy_th.res1 = 0;
3154 newsk->dummy_th.doff = 6;
3155 newsk->dummy_th.fin = 0;
3156 newsk->dummy_th.syn = 0;
3157 newsk->dummy_th.rst = 0;
3158 newsk->dummy_th.psh = 0;
3159 newsk->dummy_th.ack = 0;
3160 newsk->dummy_th.urg = 0;
3161 newsk->dummy_th.res2 = 0;
3162 newsk->acked_seq = skb->seq + 1;
3163 newsk->copied_seq = skb->seq + 1;
3164 newsk->socket = NULL;
3165
3166 /*3167 * Grab the ttl and tos values and use them 3168 */3169
3170 newsk->ip_ttl=sk->ip_ttl;
3171 newsk->ip_tos=skb->ip_hdr->tos;
3172
3173 /*3174 * Use 512 or whatever user asked for 3175 */3176
3177 /*3178 * Note use of sk->user_mss, since user has no direct access to newsk 3179 */3180
3181 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
3182 newsk->ip_route_cache = rt;
3183
3184 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3185 newsk->window_clamp = rt->rt_window;
3186 else3187 newsk->window_clamp = 0;
3188
3189 if (sk->user_mss)
3190 newsk->mtu = sk->user_mss;
3191 elseif (rt)
3192 newsk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
3193 else3194 newsk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
3195
3196 /*3197 * But not bigger than device MTU 3198 */3199
3200 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
3201
3202 #ifdefCONFIG_SKIP3203
3204 /*3205 * SKIP devices set their MTU to 65535. This is so they can take packets3206 * unfragmented to security process then fragment. They could lie to the3207 * TCP layer about a suitable MTU, but its easier to let skip sort it out3208 * simply because the final package we want unfragmented is going to be3209 *3210 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]3211 */3212
3213 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */3214 sk->mtu=skip_pick_mtu(sk->mtu,dev);
3215 #endif3216 /*3217 * This will min with what arrived in the packet 3218 */3219
3220 tcp_options(newsk,skb->h.th);
3221
3222 tcp_cache_zap();
3223
3224 buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3225 if (buff == NULL)
3226 {3227 sk->err = ENOMEM;
3228 newsk->dead = 1;
3229 newsk->state = TCP_CLOSE;
3230 /* And this will destroy it */3231 release_sock(newsk);
3232 kfree_skb(skb, FREE_READ);
3233 tcp_statistics.TcpAttemptFails++;
3234 return;
3235 }3236
3237 buff->sk = newsk;
3238 buff->localroute = newsk->localroute;
3239
3240 /*3241 * Put in the IP header and routing stuff. 3242 */3243
3244 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3245 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&newsk->ip_route_cache);
3246
3247 /*3248 * Something went wrong. 3249 */3250
3251 if (tmp < 0)
3252 {3253 sk->err = tmp;
3254 buff->free = 1;
3255 kfree_skb(buff,FREE_WRITE);
3256 newsk->dead = 1;
3257 newsk->state = TCP_CLOSE;
3258 release_sock(newsk);
3259 skb->sk = sk;
3260 kfree_skb(skb, FREE_READ);
3261 tcp_statistics.TcpAttemptFails++;
3262 return;
3263 }3264
3265 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
3266
3267 memcpy(t1, skb->h.th, sizeof(*t1));
3268 buff->seq = newsk->write_seq++;
3269 buff->end_seq = newsk->write_seq;
3270 /*3271 * Swap the send and the receive. 3272 */3273 t1->dest = skb->h.th->source;
3274 t1->source = newsk->dummy_th.source;
3275 t1->seq = ntohl(buff->seq);
3276 t1->ack = 1;
3277 newsk->sent_seq = newsk->write_seq;
3278 t1->window = ntohs(tcp_select_window(newsk));
3279 t1->res1 = 0;
3280 t1->res2 = 0;
3281 t1->rst = 0;
3282 t1->urg = 0;
3283 t1->psh = 0;
3284 t1->syn = 1;
3285 t1->ack_seq = htonl(newsk->acked_seq);
3286 t1->doff = sizeof(*t1)/4+1;
3287 ptr = skb_put(buff,4);
3288 ptr[0] = 2;
3289 ptr[1] = 4;
3290 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3291 ptr[3] =(newsk->mtu) & 0xff;
3292
3293 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3294 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3295 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3296 skb->sk = newsk;
3297
3298 /*3299 * Charge the sock_buff to newsk. 3300 */3301
3302 sk->rmem_alloc -= skb->truesize;
3303 newsk->rmem_alloc += skb->truesize;
3304
3305 skb_queue_tail(&sk->receive_queue,skb);
3306 sk->ack_backlog++;
3307 release_sock(newsk);
3308 tcp_statistics.TcpOutSegs++;
3309 }3310
3311
3312 staticvoidtcp_close(structsock *sk, inttimeout)
/* */3313 {3314 /*3315 * We need to grab some memory, and put together a FIN, 3316 * and then put it into the queue to be sent.3317 */3318
3319 sk->inuse = 1;
3320
3321 if(th_cache_sk==sk)
3322 tcp_cache_zap();
3323 if(sk->state == TCP_LISTEN)
3324 {3325 /* Special case */3326 tcp_set_state(sk, TCP_CLOSE);
3327 tcp_close_pending(sk);
3328 release_sock(sk);
3329 return;
3330 }3331
3332 sk->keepopen = 1;
3333 sk->shutdown = SHUTDOWN_MASK;
3334
3335 if (!sk->dead)
3336 sk->state_change(sk);
3337
3338 if (timeout == 0)
3339 {3340 structsk_buff *skb;
3341
3342 /*3343 * We need to flush the recv. buffs. We do this only on the3344 * descriptor close, not protocol-sourced closes, because the3345 * reader process may not have drained the data yet!3346 */3347
3348 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
3349 kfree_skb(skb, FREE_READ);
3350 /*3351 * Get rid off any half-completed packets. 3352 */3353
3354 if (sk->partial)
3355 tcp_send_partial(sk);
3356 }3357
3358
3359 /*3360 * Timeout is not the same thing - however the code likes3361 * to send both the same way (sigh).3362 */3363
3364 if(timeout)
3365 {3366 tcp_set_state(sk, TCP_CLOSE); /* Dead */3367 }3368 else3369 {3370 if(tcp_close_state(sk,1)==1)
3371 {3372 tcp_send_fin(sk);
3373 }3374 }3375 release_sock(sk);
3376 }3377
3378
3379 /*3380 * This routine takes stuff off of the write queue,3381 * and puts it in the xmit queue. This happens as incoming acks3382 * open up the remote window for us.3383 */3384
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue. This happens as incoming acks
 *	open up the remote window for us.
 */
static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy
	 */
	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */
	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->end_seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->end_seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */
		if (before(skb->end_seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it. This
			 *	cannot currently occur.
			 */
			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
			/*
			 * put in the ack seq and window at this point rather than earlier,
			 * in order to keep them monotonic.  We really want to avoid taking
			 * back window allocations.  That's legal, but RFC1122 says it's frowned on.
			 * Ack and window will in general have changed since this packet was put
			 * on the write queue.
			 */
			iph = skb->ip_hdr;
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			/* TCP payload length = total skb length minus the
			   headers preceding the TCP header. */
			size = skb->len - (((unsigned char *) th) - skb->data);
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
			/* Oversized for the path: clear DF so routers may
			   fragment, and refresh the IP checksum. */
			if (size > sk->mtu - sizeof(struct iphdr))
			{
				iph->frag_off &= ~htons(IP_DF);
				ip_send_check(iph);
			}
#endif

			th->ack_seq = htonl(sk->acked_seq);
			th->window = htons(tcp_select_window(sk));

			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->end_seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */
			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/* This segment carries our ACK: nothing pending. */
			sk->ack_backlog = 0;
			sk->bytes_rcv = 0;

			/*
			 *	Again we slide the timer wrongly
			 */
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3480
3481 /*3482 * This routine deals with incoming acks, but not outgoing ones.3483 */3484
/*
 * tcp_ack() - process the ACK information in an incoming segment.
 *
 * Handles only the acknowledgement side: window updates, congestion
 * window growth (Jacobson slow start / congestion avoidance), RTT
 * estimation, retransmit-queue trimming, zero-window probe completion,
 * and the ACK-driven state transitions (LAST_ACK, FIN_WAIT1, CLOSING,
 * SYN_RECV).  It never generates outgoing data itself beyond kicking
 * tcp_write_xmit()/tcp_send_partial().
 *
 * @sk:    socket the segment belongs to
 * @th:    TCP header of the received segment (network byte order fields)
 * @saddr: source address of the segment (unused here but kept for the
 *         common receive-path signature)
 * @len:   length of the TCP part of the segment (header + data)
 *
 * Returns 1 when the ack was processed (or the socket is dead/zapped),
 * 0 when the ack was ignored as out of range ("newer than sent").
 *
 * The local 'flag' bitmask records what happened:
 *   1 - there was data in the packet as well as ack, or new data was
 *       sent, or we are in a shutdown state
 *   2 - data from the retransmit queue was acked and removed
 *       (also abused below as "don't re-sample the RTT")
 *   4 - window shrunk, or data from retransmit queue was acked/removed
 */
extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
	u32 ack;
	int flag = 0;

	if (sk->zapped)
		return(1);	/* Dead, cant ack any more so why bother */

	/*
	 * Have we discovered a larger window than we have seen before?
	 * If so, remember it and recompute the mss we advertise/use.
	 */
	ack = ntohl(th->ack_seq);

	if (ntohs(th->window) > sk->max_window)
	{
		sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(sk->max_window>>1, sk->mtu);
#else
		sk->mss = min(sk->max_window, sk->mtu);
#endif
	}

	/*
	 * We have dropped back to keepalive timeouts. Thus we have
	 * no retransmits pending.
	 */
	if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 * If the ack is newer than anything sent, or older than previous
	 * acks, then we can probably ignore it.
	 */
	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
	{
		if (sk->debug)
			printk("Ack ignored %u %u\n",ack,sk->sent_seq);

		/*
		 * Keepalive processing: an ack for data we never sent is
		 * rejected outright (return 0 so the caller can react).
		 */
		if (after(ack, sk->sent_seq))
		{
			return(0);
		}

		/*
		 * An old (duplicate) ack: restart the keepalive timer
		 * if keepalives are in use, then treat it as handled.
		 */
		if (sk->keepopen)
		{
			if (sk->ip_xmit_timeout==TIME_KEEPOPEN)
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
		}
		return(1);
	}

	/*
	 * If there is data in the segment (not just a header) set flag 1.
	 */
	if (len != th->doff*4)
		flag |= 1;

	/*
	 * See if our window has been shrunk.
	 */
	if (after(sk->window_seq, ack+ntohs(th->window)))
	{
		/*
		 * We may need to move packets from the send queue
		 * to the write queue, if the window has been shrunk on us.
		 * The RFC says you are not allowed to shrink your window
		 * like this, but if the other end does, you must be able
		 * to deal with it.
		 */
		struct sk_buff *skb;
		struct sk_buff *skb2;
		struct sk_buff *wskb = NULL;

		skb2 = sk->send_head;
		sk->send_head = NULL;
		sk->send_tail = NULL;

		/*
		 * This is an artifact of a flawed concept. We want one
		 * queue and a smarter send routine when we send all.
		 */
		flag |= 4;	/* Window changed */

		sk->window_seq = ack + ntohs(th->window);
		cli();		/* walk the retransmit list atomically */
		while (skb2 != NULL)
		{
			skb = skb2;
			skb2 = skb->link3;
			skb->link3 = NULL;
			if (after(skb->end_seq, sk->window_seq))
			{
				/* Falls outside the shrunk window: push it
				   back onto write_queue, preserving order. */
				if (sk->packets_out > 0)
					sk->packets_out--;
				/* We may need to remove this from the dev send list. */
				if (skb->next != NULL)
				{
					skb_unlink(skb);
				}
				/* Now add it to the write_queue. */
				if (wskb == NULL)
					skb_queue_head(&sk->write_queue,skb);
				else
					skb_append(wskb,skb);
				wskb = skb;
			}
			else
			{
				/* Still inside the window: rebuild the
				   send_head/send_tail retransmit list. */
				if (sk->send_head == NULL)
				{
					sk->send_head = skb;
					sk->send_tail = skb;
				}
				else
				{
					sk->send_tail->link3 = skb;
					sk->send_tail = skb;
				}
				skb->link3 = NULL;
			}
		}
		sti();
	}

	/*
	 * Pipe has emptied: keep head/tail/packets_out consistent.
	 */
	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out= 0;
	}

	/*
	 * Update the right hand window edge of the host.
	 */
	sk->window_seq = ack + ntohs(th->window);

	/*
	 * We don't want too many packets out there.
	 */
	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
		 * counter and increment it once every cwnd times. It's possible
		 * that this should be done only if sk->retransmits == 0. I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 * In "safe" area, increase (slow start).
			 */
			sk->cong_window++;
		else
		{
			/*
			 * In dangerous area, increase slowly. In theory this is
			 * sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

	/*
	 * Remember the highest ack received.
	 */
	sk->rcv_ack_seq = ack;

	/*
	 * We passed data and got it acked, remove any soft error
	 * log. Something worked...
	 */
	sk->err_soft = 0;

	/*
	 * If this ack opens up a zero window, clear backoff. It was
	 * being used to time the probes, and is probably far higher than
	 * it needs to be for normal retransmission.
	 */
	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 * Was it a usable window open?
		 */
		if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
		    ! before (sk->window_seq, sk->write_queue.next->end_seq))
		{
			sk->backoff = 0;

			/*
			 * Recompute rto from rtt. This eliminates any backoff.
			 */
			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < HZ/5)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = HZ/5;
		}
	}

	/*
	 * See if we can take anything off of the retransmit queue.
	 */
	while (sk->send_head != NULL)
	{
		/* Check for a bug: the list must stay sequence-ordered. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->end_seq, sk->send_head->link3->end_seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 * If our packet is before the ack sequence we can
		 * discard it as it's confirmed to have arrived the other end.
		 */
		if (before(sk->send_head->end_seq, ack+1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 * We were retransmitting: don't count this in RTT est.
				 */
				flag |= 2;

				/*
				 * Even though we've gotten an ack, we're still
				 * retransmitting as long as we're sending from
				 * the retransmit queue. Keeping retransmits non-zero
				 * prevents us from getting new data interspersed with
				 * retransmissions.
				 */
				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 * Note that we only reset backoff and rto in the
			 * rtt recomputation code. And that doesn't happen
			 * if there were retransmissions in effect. So the
			 * first new packet after the retransmissions is
			 * sent with the backoff still in effect. Not until
			 * we get an ack from a non-retransmitted packet do
			 * we reset the backoff and rto. This allows us to deal
			 * with a situation where the network delay has increased
			 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 * We have one less packet out there.
			 */
			if (sk->packets_out > 0)
				sk->packets_out --;
			/*
			 * Wake up the process, it can probably write more.
			 */
			if (!sk->dead)
				sk->write_space(sk);
			oskb = sk->send_head;

			if (!(flag&2)) 	/* Not retransmitting */
			{
				long m;

				/*
				 * The following amusing code comes from Jacobson's
				 * article in SIGCOMM '88. Note that rtt and mdev
				 * are scaled versions of rtt and mean deviation.
				 * This is designed to be as fast as possible.
				 * m stands for "measurement".
				 */
				m = jiffies - oskb->when;  /* RTT */
				if (m<=0)
					m=1;		/* IS THIS RIGHT FOR <0 ??? */
				m -= (sk->rtt >> 3);    /* m is now error in rtt est */
				sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
				if (m < 0)
					m = -m;		/* m is now abs(error) */
				m -= (sk->mdev >> 2);   /* similar update on mdev */
				sk->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */

				/*
				 * Now update timeout. Note that this removes any backoff.
				 */
				sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
				if (sk->rto > 120*HZ)
					sk->rto = 120*HZ;
				if (sk->rto < HZ/5)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
					sk->rto = HZ/5;
				sk->backoff = 0;
			}
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
					   In this case as we just set it up */
			cli();		/* unlink the acked skb atomically */
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 * We may need to remove this from the dev send list.
			 */
			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 * XXX someone ought to look at this too.. at the moment, if skb_peek()
	 * returns non-NULL, we completely ignore the timer stuff in the else
	 * clause. We ought to organize the code so that else clause can
	 * (should) be executed regardless, possibly moving the PROBE timer
	 * reset over. The skb_peek() thing should only move stuff to the
	 * write queue, NOT also manage the timer functions.
	 */

	/*
	 * Maybe we can take some stuff off of the write queue,
	 * and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 * Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
			 sk->send_head == NULL &&
			 sk->ack_backlog == 0 &&
			 sk->state != TCP_TIME_WAIT)
		{
			/*
			 * Data to queue but no room: start zero-window probing.
			 */
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 * From TIME_WAIT we stay in TIME_WAIT as long as we rx packets.
		 * From TCP_CLOSE we don't do anything.
		 *
		 * From anything else, if there is write data (or fin) pending,
		 * we use a TIME_WRITE timeout, else if keepalive we reset to
		 * a KEEPALIVE timeout, else we delete the timer.
		 *
		 * We do not set flag for nominal write data, otherwise we may
		 * force a state where we start to write itsy bitsy tidbits
		 * of data.
		 */
		switch (sk->state) {
		case TCP_TIME_WAIT:
			/*
			 * Keep us in TIME_WAIT until we stop getting packets,
			 * reset the timeout.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 * Don't touch the timer.
			 */
			break;
		default:
			/*
			 * Must check send_head, write_queue, and ack_backlog
			 * to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 * We have nothing queued but space to send. Send any partial
	 * packets immediately (end of Nagle rule application).
	 */
	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
	 * we are now waiting for an acknowledge to our FIN. The other end is
	 * already in TIME_WAIT.
	 *
	 * Move to TCP_CLOSE on success.
	 */
	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->debug)
			printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			sk->shutdown = SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_CLOSE);
			return 1;
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 * Move to FIN_WAIT2 to await a FIN from the other end. Set
	 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */
	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 * Move to TIME_WAIT.
	 */
	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 * Final ack of a three way shake: the passive open completes.
	 */
	if (sk->state==TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest=th->source;
		sk->copied_seq = sk->acked_seq;
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->max_window==0)
		{
			sk->max_window=32;	/* Sanity check */
			sk->mss=min(sk->max_window,sk->mtu);
		}
	}

	/*
	 * I make no guarantees about the first clause in the following
	 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
	 * what conditions "!flag" would be true. However I think the rest
	 * of the conditions would prevent that from causing any
	 * unnecessary retransmission.
	 * Clearly if the first packet has expired it should be
	 * retransmitted. The other alternative, "flag&2 && retransmits", is
	 * harder to explain: You have to look carefully at how and when the
	 * timer is set and with what timeout. The most recent transmission always
	 * sets the timer. So in general if the most recent thing has timed
	 * out, everything before it has as well. So we want to go ahead and
	 * retransmit some more. If we didn't explicitly test for this
	 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 * would not be true. If you look at the pattern of timing, you can
	 * show that rto is increased fast enough that the next packet would
	 * almost never be retransmitted immediately. Then you'd end up
	 * waiting for a timeout to send each packet on the retransmission
	 * queue. With my implementation of the Karn sampling algorithm,
	 * the timeout would double each time. The net result is that it would
	 * take a hideous amount of time to recover from a single dropped packet.
	 * It's possible that there should also be a test for TIME_WRITE, but
	 * I think as long as "send_head != NULL" and "retransmit" is on, we've
	 * got to be in real retransmission mode.
	 * Note that tcp_do_retransmit is called with all==1. Setting cong_window
	 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 * As long as no further losses occur, this seems reasonable.
	 */
	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	       (((flag&2) && sk->retransmits) ||
	       (sk->send_head->when + sk->rto < jiffies)))
	{
		if (sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return(1);
}
4079
4080 /*4081 * Process the FIN bit. This now behaves as it is supposed to work4082 * and the FIN takes effect when it is validly part of sequence4083 * space. Not before when we get holes.4084 *4085 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT4086 * (and thence onto LAST-ACK and finally, CLOSE, we never enter4087 * TIME-WAIT)4088 *4089 * If we are in FINWAIT-1, a received FIN indicates simultaneous4090 * close and we go into CLOSING (and later onto TIME-WAIT)4091 *4092 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.4093 *4094 */4095
/*
 * tcp_fin() - act on a FIN that has become valid in sequence space.
 *
 * @skb: the segment carrying the FIN
 * @sk:  the socket receiving it
 * @th:  the TCP header (used for the RST-with-FIN case)
 *
 * Records the FIN's sequence number, wakes any waiter, and performs
 * the per-state transition:
 *   ESTABLISHED (and SYN states)  -> CLOSE_WAIT
 *   FIN_WAIT1 (simultaneous close)-> CLOSING
 *   FIN_WAIT2                     -> TIME_WAIT
 *   TIME_WAIT (retransmitted FIN) -> restart the 2MSL timer
 * Always returns 0.  The ack for the FIN is sent by tcp_data(), not here.
 */
static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	sk->fin_seq = skb->end_seq;

	/* Wake readers/selectors; the connection state is about to change. */
	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch (sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * Move to CLOSE_WAIT; tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			/* A FIN combined with RST shuts both directions. */
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * Received a retransmission of the FIN; do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * Received a retransmission of the FIN;
			 * restart the TIME_WAIT timer.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens; we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */
			if (sk->ip_xmit_timeout != TIME_WRITE)
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * Received a FIN -- send ACK and enter TIME_WAIT.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * Already in CLOSE.
			 */
			break;
		default:
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}
4175
4176
4177 /*4178 * This routine handles the data. If there is room in the buffer,4179 * it will be have already been moved into it. If there is no4180 * room, then we will just have to discard the packet.4181 */4182
4183 extern__inline__inttcp_data(structsk_buff *skb, structsock *sk,
/* */4184 unsignedlongsaddr, unsignedshortlen)
4185 {4186 structsk_buff *skb1, *skb2;
4187 structtcphdr *th;
4188 intdup_dumped=0;
4189 u32new_seq, shut_seq;
4190
4191 th = skb->h.th;
4192 skb_pull(skb,th->doff*4);
4193 skb_trim(skb,len-(th->doff*4));
4194
4195 /*4196 * The bytes in the receive read/assembly queue has increased. Needed for the4197 * low memory discard algorithm 4198 */4199
4200 sk->bytes_rcv += skb->len;
4201
4202 if (skb->len == 0 && !th->fin)
4203 {4204 /* 4205 * Don't want to keep passing ack's back and forth. 4206 * (someone sent us dataless, boring frame)4207 */4208 if (!th->ack)
4209 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4210 kfree_skb(skb, FREE_READ);
4211 return(0);
4212 }4213
4214 /*4215 * We no longer have anyone receiving data on this connection.4216 */4217
4218 #ifndef TCP_DONT_RST_SHUTDOWN
4219
4220 if(sk->shutdown & RCV_SHUTDOWN)
4221 {4222 /*4223 * FIXME: BSD has some magic to avoid sending resets to4224 * broken 4.2 BSD keepalives. Much to my surprise a few non4225 * BSD stacks still have broken keepalives so we want to4226 * cope with it.4227 */4228
4229 if(skb->len) /* We don't care if it's just an ack or4230 a keepalive/window probe */4231 {4232 new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */4233
4234 /* Do this the way 4.4BSD treats it. Not what I'd4235 regard as the meaning of the spec but it's what BSD4236 does and clearly they know everything 8) */4237
4238 /*4239 * This is valid because of two things4240 *4241 * a) The way tcp_data behaves at the bottom.4242 * b) A fin takes effect when read not when received.4243 */4244
4245 shut_seq = sk->acked_seq+1; /* Last byte */4246
4247 if(after(new_seq,shut_seq))
4248 {4249 if(sk->debug)
4250 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4251 sk, new_seq, shut_seq, sk->blog);
4252 if(sk->dead)
4253 {4254 sk->acked_seq = new_seq + th->fin;
4255 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4256 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4257 tcp_statistics.TcpEstabResets++;
4258 sk->err = EPIPE;
4259 sk->error_report(sk);
4260 sk->shutdown = SHUTDOWN_MASK;
4261 tcp_set_state(sk,TCP_CLOSE);
4262 kfree_skb(skb, FREE_READ);
4263 return 0;
4264 }4265 }4266 }4267 }4268
4269 #endif4270
4271 /*4272 * Now we have to walk the chain, and figure out where this one4273 * goes into it. This is set up so that the last packet we received4274 * will be the first one we look at, that way if everything comes4275 * in order, there will be no performance loss, and if they come4276 * out of order we will be able to fit things in nicely.4277 *4278 * [AC: This is wrong. We should assume in order first and then walk4279 * forwards from the first hole based upon real traffic patterns.]4280 * 4281 */4282
4283 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */4284 {4285 skb_queue_head(&sk->receive_queue,skb);
4286 skb1= NULL;
4287 }4288 else4289 {4290 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
4291 {4292 if(sk->debug)
4293 {4294 printk("skb1=%p :", skb1);
4295 printk("skb1->seq = %d: ", skb1->seq);
4296 printk("skb->seq = %d\n",skb->seq);
4297 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4298 sk->acked_seq);
4299 }4300
4301 /*4302 * Optimisation: Duplicate frame or extension of previous frame from4303 * same sequence point (lost ack case).4304 * The frame contains duplicate data or replaces a previous frame4305 * discard the previous frame (safe as sk->inuse is set) and put4306 * the new one in its place.4307 */4308
4309 if (skb->seq==skb1->seq && skb->len>=skb1->len)
4310 {4311 skb_append(skb1,skb);
4312 skb_unlink(skb1);
4313 kfree_skb(skb1,FREE_READ);
4314 dup_dumped=1;
4315 skb1=NULL;
4316 break;
4317 }4318
4319 /*4320 * Found where it fits4321 */4322
4323 if (after(skb->seq+1, skb1->seq))
4324 {4325 skb_append(skb1,skb);
4326 break;
4327 }4328
4329 /*4330 * See if we've hit the start. If so insert.4331 */4332 if (skb1 == skb_peek(&sk->receive_queue))
4333 {4334 skb_queue_head(&sk->receive_queue, skb);
4335 break;
4336 }4337 }4338 }4339
4340 /*4341 * Figure out what the ack value for this frame is4342 */4343
4344 if (before(sk->acked_seq, sk->copied_seq))
4345 {4346 printk("*** tcp.c:tcp_data bug acked < copied\n");
4347 sk->acked_seq = sk->copied_seq;
4348 }4349
4350 /*4351 * Now figure out if we can ack anything. This is very messy because we really want two4352 * receive queues, a completed and an assembly queue. We also want only one transmit4353 * queue.4354 */4355
4356 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1))
4357 {4358 if (before(skb->seq, sk->acked_seq+1))
4359 {4360
4361 if (after(skb->end_seq, sk->acked_seq))
4362 sk->acked_seq = skb->end_seq;
4363
4364 skb->acked = 1;
4365
4366 /*4367 * When we ack the fin, we do the FIN 4368 * processing.4369 */4370
4371 if (skb->h.th->fin)
4372 {4373 tcp_fin(skb,sk,skb->h.th);
4374 }4375
4376 for(skb2 = skb->next;
4377 skb2 != (structsk_buff *)&sk->receive_queue;
4378 skb2 = skb2->next)
4379 {4380 if (before(skb2->seq, sk->acked_seq+1))
4381 {4382 if (after(skb2->end_seq, sk->acked_seq))
4383 sk->acked_seq = skb2->end_seq;
4384
4385 skb2->acked = 1;
4386 /*4387 * When we ack the fin, we do4388 * the fin handling.4389 */4390 if (skb2->h.th->fin)
4391 {4392 tcp_fin(skb,sk,skb->h.th);
4393 }4394
4395 /*4396 * Force an immediate ack.4397 */4398
4399 sk->ack_backlog = sk->max_ack_backlog;
4400 }4401 else4402 {4403 break;
4404 }4405 }4406
4407 /*4408 * This also takes care of updating the window.4409 * This if statement needs to be simplified.4410 *4411 * rules for delaying an ack:4412 * - delay time <= 0.5 HZ4413 * - we don't have a window update to send4414 * - must send at least every 2 full sized packets4415 */4416 if (!sk->delay_acks ||
4417 sk->ack_backlog >= sk->max_ack_backlog ||
4418 sk->bytes_rcv > sk->max_unacked || th->fin ||
4419 sk->ato > HZ/2 ||
4420 tcp_raise_window(sk)) {4421 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4422 }4423 else4424 {4425 sk->ack_backlog++;
4426
4427 if(sk->debug)
4428 printk("Ack queued.\n");
4429 reset_xmit_timer(sk, TIME_WRITE, sk->ato);
4430 }4431 }4432 }4433
4434 /*4435 * If we've missed a packet, send an ack.4436 * Also start a timer to send another.4437 */4438
4439 if (!skb->acked)
4440 {4441
4442 /*4443 * This is important. If we don't have much room left,4444 * we need to throw out a few packets so we have a good4445 * window. Note that mtu is used, not mss, because mss is really4446 * for the send side. He could be sending us stuff as large as mtu.4447 */4448
4449 while (sock_rspace(sk) < sk->mtu)
4450 {4451 skb1 = skb_peek(&sk->receive_queue);
4452 if (skb1 == NULL)
4453 {4454 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4455 break;
4456 }4457
4458 /*4459 * Don't throw out something that has been acked. 4460 */4461
4462 if (skb1->acked)
4463 {4464 break;
4465 }4466
4467 skb_unlink(skb1);
4468 kfree_skb(skb1, FREE_READ);
4469 }4470 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4471 sk->ack_backlog++;
4472 reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, 0.5 * HZ));
4473 }4474 else4475 {4476 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4477 }4478
4479 /*4480 * Now tell the user we may have some data. 4481 */4482
4483 if (!sk->dead)
4484 {4485 if(sk->debug)
4486 printk("Data wakeup.\n");
4487 sk->data_ready(sk,0);
4488 }4489 return(0);
4490 }4491
4492
4493 /*4494 * This routine is only called when we have urgent data4495 * signalled. Its the 'slow' part of tcp_urg. It could be4496 * moved inline now as tcp_urg is only called from one4497 * place. We handle URGent data wrong. We have to - as4498 * BSD still doesn't use the correction from RFC961.4499 */4500
4501 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4502 {4503 u32ptr = ntohs(th->urg_ptr);
4504
4505 if (ptr)
4506 ptr--;
4507 ptr += ntohl(th->seq);
4508
4509 /* ignore urgent data that we've already seen and read */4510 if (after(sk->copied_seq, ptr))
4511 return;
4512
4513 /* do we already have a newer (or duplicate) urgent pointer? */4514 if (sk->urg_data && !after(ptr, sk->urg_seq))
4515 return;
4516
4517 /* tell the world about our new urgent pointer */4518 if (sk->proc != 0) {4519 if (sk->proc > 0) {4520 kill_proc(sk->proc, SIGURG, 1);
4521 }else{4522 kill_pg(-sk->proc, SIGURG, 1);
4523 }4524 }4525 sk->urg_data = URG_NOTYET;
4526 sk->urg_seq = ptr;
4527 }4528
4529 /*4530 * This is the 'fast' part of urgent handling.4531 */4532
4533 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4534 unsignedlongsaddr, unsignedlonglen)
4535 {4536 u32ptr;
4537
4538 /*4539 * Check if we get a new urgent pointer - normally not 4540 */4541
4542 if (th->urg)
4543 tcp_check_urg(sk,th);
4544
4545 /*4546 * Do we wait for any urgent data? - normally not4547 */4548
4549 if (sk->urg_data != URG_NOTYET)
4550 return 0;
4551
4552 /*4553 * Is the urgent pointer pointing into this packet? 4554 */4555
4556 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
4557 if (ptr >= len)
4558 return 0;
4559
4560 /*4561 * Ok, got the correct packet, update info 4562 */4563
4564 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4565 if (!sk->dead)
4566 sk->data_ready(sk,0);
4567 return 0;
4568 }4569
4570 /*4571 * This will accept the next outstanding connection. 4572 */4573
/*
 * tcp_accept() - dequeue the next established connection on a listener.
 *
 * @sk:    the listening socket
 * @flags: open flags; O_NONBLOCK makes this fail with EAGAIN instead of
 *         sleeping
 *
 * Returns the newly established child socket, or NULL with sk->err set
 * (EINVAL if not listening, EAGAIN for a non-blocking miss, ERESTARTSYS
 * on signal).  The cli()/sti() bracket plus sk->inuse guard against the
 * race between checking the accept queue and sleeping on it.
 */
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 * We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */
	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race: disable interrupts before testing the queue. */
	cli();
	sk->inuse = 1;

	while ((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Drop the socket lock while we sleep so the bottom half
		   can queue incoming connections and wake us. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked)
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
	}
	sti();

	/*
	 * Now all we need to do is return skb->sk: the child socket rides
	 * on the queued skb.
	 */
	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4628
4629 /*4630 * This will initiate an outgoing connection. 4631 */4632
/*
 * tcp_connect() - initiate an outgoing (active) TCP connection.
 *
 * @usin:     destination address/port (AF_INET); INADDR_ANY is mapped
 *            to the local address (BSD'ism)
 * @addr_len: length of *usin; must be at least 8 bytes
 *
 * Builds and transmits the initial SYN (with an MSS option), moves the
 * socket to SYN_SENT, and arms the retransmit timer.  Returns 0 on
 * success or a negative errno (-EISCONN, -EINVAL, -EAFNOSUPPORT,
 * -ENETUNREACH, -ENOMEM).
 */
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
		return(-EISCONN);

	/*
	 * Don't allow a double connect.
	 */
	if (sk->daddr)
		return -EINVAL;

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 * connect() to INADDR_ANY means loopback (BSD'ism).
	 */
	if (usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 * Don't want a TCP connection going to a broadcast address.
	 */
	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	/* Drop the lock around the (possibly sleeping) allocation. */
	release_sock(sk);

	buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;


	/*
	 * Put in the IP header and routing stuff.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		sock_wfree(sk, buff);
		release_sock(sk);
		return(-ENETUNREACH);
	}
	/* Fill in the source address from the route if unbound. */
	if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
		sk->saddr = rt->rt_src;
	sk->rcv_saddr = sk->saddr;

	t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));

	/* Build the SYN: template header, then flag/field fixups. */
	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	buff->seq = sk->write_seq++;
	t1->seq = htonl(buff->seq);
	sk->sent_seq = sk->write_seq;
	buff->end_seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;	/* 24 bytes: header plus the 4-byte MSS option */
	/* use 512 or whatever user asked for */

	if (rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/* Pick the MSS: user override, then route MTU, then the 576 default. */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if (rt)
		sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	else
		sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

	/*
	 * But not bigger than device MTU.
	 */
	if (sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

#ifdef CONFIG_SKIP

	/*
	 * SKIP devices set their MTU to 65535. This is so they can take packets
	 * unfragmented to security process then fragment. They could lie to the
	 * TCP layer about a suitable MTU, but its easier to let skip sort it out
	 * simply because the final package we want unfragmented is going to be
	 *
	 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]
	 */
	if (skip_pick_mtu!=NULL)	/* If SKIP is loaded.. */
		sk->mtu=skip_pick_mtu(sk->mtu,dev);
#endif

	/*
	 * Put in the TCP options to say MTU (kind 2, length 4, MSS value).
	 */
	ptr = skb_put(buff,4);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 * This must go first otherwise a really quick response will get reset.
	 */
	tcp_cache_zap();
	tcp_set_state(sk,TCP_SYN_SENT);
	if (rt&&rt->rt_flags&RTF_IRTT)
		sk->rto = rt->rt_irtt;
	else
		sk->rto = TCP_TIMEOUT_INIT;
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = 0;	/* Now works the right way instead of a hacked
				   initial setting */

	sk->prot->queue_xmit(sk, dev, buff, 0);
	/* NOTE(review): this second timer reset looks redundant with the one
	   a few lines above — harmless, but worth confirming/removing. */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4801
4802 /*4803 * This functions checks to see if the tcp header is actually acceptable. 4804 */4805
4806 extern__inline__inttcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4807 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4808 {4809 u32next_seq;
4810
4811 next_seq = len - 4*th->doff;
4812 if (th->fin)
4813 next_seq++;
4814 /* if we have a zero window, we can't have any data in the packet.. */4815 if (next_seq && !sk->window)
4816 gotoignore_it;
4817 next_seq += ntohl(th->seq);
4818
4819 /*4820 * This isn't quite right. sk->acked_seq could be more recent4821 * than sk->window. This is however close enough. We will accept4822 * slightly more packets than we should, but it should not cause4823 * problems unless someone is trying to forge packets.4824 */4825
4826 /* have we already seen all of this packet? */4827 if (!after(next_seq+1, sk->acked_seq))
4828 gotoignore_it;
4829 /* or does it start beyond the window? */4830 if (!before(ntohl(th->seq), sk->acked_seq + sk->window + 1))
4831 gotoignore_it;
4832
4833 /* ok, at least part of this packet would seem interesting.. */4834 return 1;
4835
4836 ignore_it:
4837 if (th->rst)
4838 return 0;
4839
4840 /*4841 * Send a reset if we get something not ours and we are4842 * unsynchronized. Note: We don't do anything to our end. We4843 * are just killing the bogus remote connection then we will4844 * connect again and it will work (with luck).4845 */4846
4847 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4848 {4849 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4850 return 1;
4851 }4852
4853 /* Try to resync things. */4854 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4855 return 0;
4856 }4857
/*
 *	Standard processing for an incoming RST: mark the socket dead,
 *	pick an error code appropriate to the state the reset hit us in,
 *	move to CLOSE and wake any sleeper.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Translate the connection state into the error the user will see. */
	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	/* Wake anyone sleeping on this socket. */
	if (!sk->dead)
		sk->state_change(sk);

	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 *
 *	Entry point called from the IP layer (and re-entered with redo!=0
 *	when a backlogged frame is replayed).  Demultiplexes to a socket,
 *	verifies the checksum, then runs the RFC793/RFC1122 segment
 *	processing state machine.  Returns 0 in all cases.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	tcp_statistics.TcpInSegs++;

	/* Only frames addressed to this host are processed. */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket, using the last hit cache if applicable.
	 */

	if(!redo && saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
	{
		sk=(struct sock *)th_cache_sk;
		/*
		 *	We think this is causing the bug so
		 *	(cross-check the cache against a real lookup and
		 *	report any mismatch)
		 */
		if(sk!=get_sock(&tcp_prot,th->dest, saddr, th->source, daddr))
			printk("Cache mismatch on TCP.\n");
	}
	else
	{
		sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
		/* Refresh the one-entry demux cache with this 4-tuple. */
		th_cache_saddr=saddr;
		th_cache_daddr=daddr;
		th_cache_dport=th->dest;
		th_cache_sport=th->source;
		th_cache_sk=sk;
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo)
	{
		/*
		 *	Pull up the IP header.
		 */
		skb_pull(skb, skb->h.raw-skb->data);

		/*
		 *	Try to use the device checksum if provided.
		 */
		if (
			(skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
			(!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
		)
		{
			/* Bad checksum: drop silently. */
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}

		/* Cache host-order sequence numbers; end_seq counts SYN and FIN. */
		skb->seq = ntohl(th->seq);
		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
		skb->ack_seq = ntohl(th->ack_seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset
			 *	(checked in tcp_reset)
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			/*
			 *	Discard frame
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		/* Stored from our point of view: saddr is OUR address. */
		skb->saddr = daddr;
		skb->daddr = saddr;

		/*
		 *	We may need to add it to the backlog here.
		 *	cli/sti guard the inuse test-and-set against interrupts.
		 */
		cli();
		if (sk->inuse)
		{
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Backlog replay: socket may have died since queuing. */
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)
	{
		/*
		 *	"Can't happen" consistency check.
		 *	NOTE(review): this path returns while sk->inuse is still
		 *	set and the skb is neither freed nor charged - looks like
		 *	a lock/buffer leak if it ever fired; confirm before fixing.
		 */
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 */

	skb->sk=sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	This basically follows the flow suggested by RFC793, with the
	 *	corrections in RFC1122. We don't implement precedence and we
	 *	process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly
	 *	[Karn notes in the KA9Q code the RFC793 incoming segment rules
	 *	don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			/* These use the socket TOS.. might want to be the received TOS */
			if(th->ack)
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed
			 *	(old segments). Broadcast/multicast SYN isn't
			 *	allowed. Note - bug if you change the netmask on
			 *	a running connection it can go broadcast. Even
			 *	Sun's have this problem so I'm ignoring it
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is
			 *	nothing else in the frame. KA9Q has an option to
			 *	send data with the syn, BSD accepts data with the
			 *	syn up to the [to be] advertised window and
			 *	Solaris 2.1 gives you a protocol error. For now
			 *	we just ignore it, that fits the spec precisely
			 *	and avoids incompatibilities. It would be nice in
			 *	future to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and
		 *	either reset for bad matches or go to connected.
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/*
					 *	Reset the ack - its an ack from a
					 *	different connection  [ th->rst is
					 *	checked in tcp_reset() ]
					 */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/*
					 *	A valid ack from a different
					 *	connection start. Shouldn't
					 *	happen but cover it.
					 */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}

				/*
				 *	Ok.. it's good. Set up sequence numbers
				 *	and move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq = skb->seq+1;
				sk->lastwin_seq = skb->seq+1;
				sk->fin_seq = skb->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				/* Peer advertised no window yet: pick a minimal one. */
				if(sk->max_window==0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/*
					 *	Crossed SYN's are fine - but
					 *	talking to yourself is right out...
					 */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port.
	 *	There is a more complex suggestion for fixing these reuse
	 *	issues in RFC1644 but not yet ready for general use. Also see
	 *	RFC1379.
	 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		/* New SYN beyond our old data on a dead TIME_WAIT socket:
		   recycle the port for a fresh connection. */
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(skb->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			/* Uncharge the skb from the dying socket. */
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			/* Re-demux: a listener may now own the port. */
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				/* seq+128000 keeps the new ISN clear of the old data. */
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC).
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs.
	 */

	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}


	/*
	 *	Delayed ACK time estimator.
	 */

	if (sk->lrcvtime == 0)
	{
		/* First segment: seed the inter-arrival estimate. */
		sk->lrcvtime = jiffies;
		sk->ato = HZ/3;
	}
	else
	{
		int m;

		m = jiffies - sk->lrcvtime;

		sk->lrcvtime = jiffies;

		if (m <= 0)
			m = 1;

		/* Clamp the ack timeout to an eighth of the round trip time. */
		if (m > (sk->rtt >> 3))
		{
			sk->ato = sk->rtt >> 3;
			/*
			 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
			 */
		}
		else
		{
			sk->ato = (sk->ato >> 1) + m;
			/*
			 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
			 */
		}
	}

	/*
	 *	Process the ACK
	 */

	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	If the accepted buffer put us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */

	if (sk->rmem_alloc >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}


	/*
	 *	Process urgent data
	 */

	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	Process the encapsulated data
	 */

	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */

	release_sock(sk);
	return 0;
}
/*
 *	This routine sends a packet with an out of date sequence
 *	number. It assumes the other end will try to ack it.
 *
 *	Used as a zero-window probe: either retransmit a window-sized
 *	slice of the first queued segment (when the peer has opened a
 *	partial window) or send a 1-byte-less-than-current sequence ACK
 *	to provoke a reply.
 */

static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff,*skb;
	struct tcphdr *t1;
	struct device *dev=NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 *	Write data can still be transmitted/retransmitted in the
	 *	following states. If any other state is encountered, return.
	 *	[listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED &&
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 &&
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	)
	{
		return;
	}

	if ( before(sk->sent_seq, sk->window_seq) &&
	    (skb=skb_peek(&sk->write_queue)))
	{
		/*
		 *	We are probing the opening of a window
		 *	but the window size is != 0
		 *	must have been a result SWS advoidance ( sender )
		 */

		struct iphdr *iph;
		struct tcphdr *th;
		struct tcphdr *nth;
		unsigned long win_size;
#if 0
		unsigned long ow_size;
#endif
		void * tcp_data_start;

		/*
		 *	How many bytes can we send ?
		 */

		win_size = sk->window_seq - sk->sent_seq;

		/*
		 *	Recover the buffer pointers of the queued frame.
		 */

		iph = (struct iphdr *)skb->ip_hdr;
		th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));

		/*
		 *	Grab the data for a temporary frame
		 */

		buff = sock_wmalloc(sk, win_size + th->doff * 4 +
				     (iph->ihl << 2) +
				     sk->prot->max_header + 15,
				     1, GFP_ATOMIC);
		if ( buff == NULL )
			return;

		/*
		 *	If we strip the packet on the write queue we must
		 *	be ready to retransmit this one
		 */

		buff->free = /*0*/1;

		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put headers on the new packet
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, buff->truesize,
				sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
		if (tmp < 0)
		{
			sock_wfree(sk, buff);
			return;
		}

		/*
		 *	Move the TCP header over
		 */

		buff->dev = dev;

		nth = (struct tcphdr *) skb_put(buff,th->doff*4);

		memcpy(nth, th, th->doff * 4);

		/*
		 *	Correct the new header
		 */

		nth->ack = 1;
		nth->ack_seq = htonl(sk->acked_seq);
		nth->window = htons(tcp_select_window(sk));
		nth->check = 0;

		/*
		 *	Find the first data byte.
		 */

		tcp_data_start = (char *) th + (th->doff << 2);

		/*
		 *	Add it to our new buffer
		 */

		memcpy(skb_put(buff,win_size), tcp_data_start, win_size);

		/*
		 *	Remember our right edge sequence number.
		 */

		buff->end_seq = sk->sent_seq + win_size;
		sk->sent_seq = buff->end_seq;		/* Hack */

		/* The urgent pointer fell outside the copied slice: clear URG. */
		if(th->urg && ntohs(th->urg_ptr) < win_size)
			nth->urg = 0;

		/*
		 *	Checksum the split buffer
		 */

		tcp_send_check(nth, sk->saddr, sk->daddr,
			       nth->doff * 4 + win_size , sk);
	}
	else
	{
		/* No window to probe into: send a bare out-of-date ACK. */
		buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
		if (buff == NULL)
			return;

		buff->free = 1;
		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put in the IP header and routing stuff.
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
		if (tmp < 0)
		{
			sock_wfree(sk, buff);
			return;
		}

		t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
		memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

		/*
		 *	Use a previous sequence.
		 *	This should cause the other end to send an ack.
		 */

		t1->seq = htonl(sk->sent_seq-1);
		t1->ack = 1;
		t1->res1= 0;
		t1->res2= 0;
		t1->rst = 0;
		t1->urg = 0;
		t1->psh = 0;
		t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
		t1->syn = 0;
		t1->ack_seq = htonl(sk->acked_seq);
		t1->window = htons(tcp_select_window(sk));
		t1->doff = sizeof(*t1)/4;
		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	}

	/*
	 *	Send it.
	 */

	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
5545 /*5546 * A window probe timeout has occurred.5547 */5548
5549 voidtcp_send_probe0(structsock *sk)
/* */5550 {5551 if (sk->zapped)
5552 return; /* After a valid reset we can send no more */5553
5554 tcp_write_wakeup(sk);
5555
5556 sk->backoff++;
5557 sk->rto = min(sk->rto << 1, 120*HZ);
5558 sk->retransmits++;
5559 sk->prot->retransmits ++;
5560 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5561 }5562
5563 /*5564 * Socket option code for TCP. 5565 */5566
5567 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5568 {5569 intval,err;
5570
5571 if(level!=SOL_TCP)
5572 returnip_setsockopt(sk,level,optname,optval,optlen);
5573
5574 if (optval == NULL)
5575 return(-EINVAL);
5576
5577 err=verify_area(VERIFY_READ, optval, sizeof(int));
5578 if(err)
5579 returnerr;
5580
5581 val = get_user((int *)optval);
5582
5583 switch(optname)
5584 {5585 caseTCP_MAXSEG:
5586 /*5587 * values greater than interface MTU won't take effect. however at5588 * the point when this call is done we typically don't yet know5589 * which interface is going to be used5590 */5591 if(val<1||val>MAX_WINDOW)
5592 return -EINVAL;
5593 sk->user_mss=val;
5594 return 0;
5595 caseTCP_NODELAY:
5596 sk->nonagle=(val==0)?0:1;
5597 return 0;
5598 default:
5599 return(-ENOPROTOOPT);
5600 }5601 }5602
5603 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5604 {5605 intval,err;
5606
5607 if(level!=SOL_TCP)
5608 returnip_getsockopt(sk,level,optname,optval,optlen);
5609
5610 switch(optname)
5611 {5612 caseTCP_MAXSEG:
5613 val=sk->user_mss;
5614 break;
5615 caseTCP_NODELAY:
5616 val=sk->nonagle;
5617 break;
5618 default:
5619 return(-ENOPROTOOPT);
5620 }5621 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5622 if(err)
5623 returnerr;
5624 put_user(sizeof(int),(int *) optlen);
5625
5626 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5627 if(err)
5628 returnerr;
5629 put_user(val,(int *)optval);
5630
5631 return(0);
5632 }5633
5634
/*
 *	The TCP protocol operations vector hooked into the generic INET
 *	socket layer.  This is a positional initializer; the slot meanings
 *	below are inferred from the initialiser names and must follow the
 *	struct proto declaration (not visible in this file) -- confirm
 *	against the header before reordering anything.
 */
struct proto tcp_prot = {
	tcp_close,
	ip_build_header,
	tcp_connect,
	tcp_accept,
	ip_queue_xmit,
	tcp_retransmit,
	tcp_write_wakeup,
	tcp_read_wakeup,
	tcp_rcv,
	tcp_select,
	tcp_ioctl,
	NULL,			/* presumably an init hook - unused here; TODO confirm */
	tcp_shutdown,
	tcp_setsockopt,
	tcp_getsockopt,
	tcp_sendmsg,
	tcp_recvmsg,
	NULL,			/* No special bind() */
	128,			/* presumably max_header reservation; TODO confirm */
	0,
	"TCP",			/* protocol name as shown in statistics/logs */
	0, 0,
	{NULL,}
};