1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while 26 * sk->inuse=1 and was trying to connect 27 * (tcp_err()). 28 * Alan Cox : All icmp error handling was broken 29 * pointers passed where wrong and the 30 * socket was looked up backwards. Nobody 31 * tested any icmp error code obviously. 32 * Alan Cox : tcp_err() now handled properly. It 33 * wakes people on errors. select 34 * behaves and the icmp error race 35 * has gone by moving it into sock.c 36 * Alan Cox : tcp_reset() fixed to work for 37 * everything not just packets for 38 * unknown sockets. 39 * Alan Cox : tcp option processing. 40 * Alan Cox : Reset tweaked (still not 100%) [Had 41 * syn rule wrong] 42 * Herp Rosmanith : More reset fixes 43 * Alan Cox : No longer acks invalid rst frames. 44 * Acking any kind of RST is right out. 45 * Alan Cox : Sets an ignore me flag on an rst 46 * receive otherwise odd bits of prattle 47 * escape still 48 * Alan Cox : Fixed another acking RST frame bug. 49 * Should stop LAN workplace lockups. 
50 * Alan Cox : Some tidyups using the new skb list 51 * facilities 52 * Alan Cox : sk->keepopen now seems to work 53 * Alan Cox : Pulls options out correctly on accepts 54 * Alan Cox : Fixed assorted sk->rqueue->next errors 55 * Alan Cox : PSH doesn't end a TCP read. Switched a 56 * bit to skb ops. 57 * Alan Cox : Tidied tcp_data to avoid a potential 58 * nasty. 59 * Alan Cox : Added some better commenting, as the 60 * tcp is hard to follow 61 * Alan Cox : Removed incorrect check for 20 * psh 62 * Michael O'Reilly : ack < copied bug fix. 63 * Johannes Stille : Misc tcp fixes (not all in yet). 64 * Alan Cox : FIN with no memory -> CRASH 65 * Alan Cox : Added socket option proto entries. 66 * Also added awareness of them to accept. 67 * Alan Cox : Added TCP options (SOL_TCP) 68 * Alan Cox : Switched wakeup calls to callbacks, 69 * so the kernel can layer network 70 * sockets. 71 * Alan Cox : Use ip_tos/ip_ttl settings. 72 * Alan Cox : Handle FIN (more) properly (we hope). 73 * Alan Cox : RST frames sent on unsynchronised 74 * state ack error. 75 * Alan Cox : Put in missing check for SYN bit. 76 * Alan Cox : Added tcp_select_window() aka NET2E 77 * window non shrink trick. 78 * Alan Cox : Added a couple of small NET2E timer 79 * fixes 80 * Charles Hedrick : TCP fixes 81 * Toomas Tamm : TCP window fixes 82 * Alan Cox : Small URG fix to rlogin ^C ack fight 83 * Charles Hedrick : Rewrote most of it to actually work 84 * Linus : Rewrote tcp_read() and URG handling 85 * completely 86 * Gerhard Koerting: Fixed some missing timer handling 87 * Matthew Dillon : Reworked TCP machine states as per RFC 88 * Gerhard Koerting: PC/TCP workarounds 89 * Adam Caldwell : Assorted timer/timing errors 90 * Matthew Dillon : Fixed another RST bug 91 * Alan Cox : Move to kernel side addressing changes. 92 * Alan Cox : Beginning work on TCP fastpathing 93 * (not yet usable) 94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
95 * Alan Cox : TCP fast path debugging 96 * Alan Cox : Window clamping 97 * Michael Riepe : Bug in tcp_check() 98 * Matt Dillon : More TCP improvements and RST bug fixes 99 * Matt Dillon : Yet more small nasties remove from the 100 * TCP code (Be very nice to this man if 101 * tcp finally works 100%) 8) 102 * Alan Cox : BSD accept semantics. 103 * Alan Cox : Reset on closedown bug. 104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 105 * Michael Pall : Handle select() after URG properly in 106 * all cases. 107 * Michael Pall : Undo the last fix in tcp_read_urg() 108 * (multi URG PUSH broke rlogin). 109 * Michael Pall : Fix the multi URG PUSH problem in 110 * tcp_readable(), select() after URG 111 * works now. 112 * Michael Pall : recv(...,MSG_OOB) never blocks in the 113 * BSD api. 114 * Alan Cox : Changed the semantics of sk->socket to 115 * fix a race and a signal problem with 116 * accept() and async I/O. 117 * Alan Cox : Relaxed the rules on tcp_sendto(). 118 * Yury Shevchuk : Really fixed accept() blocking problem. 119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 120 * clients/servers which listen in on 121 * fixed ports. 122 * Alan Cox : Cleaned the above up and shrank it to 123 * a sensible code size. 124 * Alan Cox : Self connect lockup fix. 125 * Alan Cox : No connect to multicast. 126 * Ross Biro : Close unaccepted children on master 127 * socket close. 128 * Alan Cox : Reset tracing code. 129 * Alan Cox : Spurious resets on shutdown. 130 * Alan Cox : Giant 15 minute/60 second timer error 131 * Alan Cox : Small whoops in selecting before an 132 * accept. 133 * Alan Cox : Kept the state trace facility since 134 * it's handy for debugging. 135 * Alan Cox : More reset handler fixes. 
136 * Alan Cox : Started rewriting the code based on 137 * the RFC's for other useful protocol 138 * references see: Comer, KA9Q NOS, and 139 * for a reference on the difference 140 * between specifications and how BSD 141 * works see the 4.4lite source. 142 * A.N.Kuznetsov : Don't time wait on completion of tidy 143 * close. 144 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 145 * Linus Torvalds : Fixed BSD port reuse to work first syn 146 * Alan Cox : Reimplemented timers as per the RFC 147 * and using multiple timers for sanity. 148 * Alan Cox : Small bug fixes, and a lot of new 149 * comments. 150 * Alan Cox : Fixed dual reader crash by locking 151 * the buffers (much like datagram.c) 152 * Alan Cox : Fixed stuck sockets in probe. A probe 153 * now gets fed up of retrying without 154 * (even a no space) answer. 155 * Alan Cox : Extracted closing code better 156 * Alan Cox : Fixed the closing state machine to 157 * resemble the RFC. 158 * Alan Cox : More 'per spec' fixes. 159 * Jorge Cwik : Even faster checksumming. 160 * Alan Cox : tcp_data() doesn't ack illegal PSH 161 * only frames. At least one pc tcp stack 162 * generates them. 163 * Alan Cox : Cache last socket. 164 * Alan Cox : Per route irtt. 165 * Matt Day : Select() match BSD precisely on error 166 * Alan Cox : New buffers 167 * Marc Tamsky : Various sk->prot->retransmits and 168 * sk->retransmits misupdating fixed. 169 * Fixed tcp_write_timeout: stuck close, 170 * and TCP syn retries gets used now. 171 * Mark Yarvis : In tcp_read_wakeup(), don't send an 172 * ack if stat is TCP_CLOSED. 173 * Alan Cox : Look up device on a retransmit - routes may 174 * change. Doesn't yet cope with MSS shrink right 175 * but its a start! 176 * Marc Tamsky : Closing in closing fixes. 177 * Mike Shaver : RFC1122 verifications. 178 * Alan Cox : rcv_saddr errors. 179 * Alan Cox : Block double connect(). 180 * Alan Cox : Small hooks for enSKIP. 181 * Alexey Kuznetsov: Path MTU discovery. 
182 * Alan Cox : Support soft errors. 183 * Alan Cox : Fix MTU discovery pathalogical case 184 * when the remote claims no mtu! 185 * Marc Tamsky : TCP_CLOSE fix. 186 * Colin (G3TNE) : Send a reset on syn ack replies in 187 * window but wrong (fixes NT lpd problems) 188 * Pedro Roque : Better TCP window handling, delayed ack. 189 * Joerg Reuter : No modification of locked buffers in 190 * tcp_do_retransmit() 191 * 192 * To Fix: 193 * Fast path the code. Two things here - fix the window calculation 194 * so it doesn't iterate over the queue, also spot packets with no funny 195 * options arriving in order and process directly. 196 * 197 * Rewrite output state machine to use a single queue. 198 * Speed up input assembly algorithm. 199 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 200 * could do with it working on IPv4 201 * User settable/learned rtt/max window/mtu 202 * 203 * Change the fundamental structure to a single send queue maintained 204 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 205 * active routes too]). Cut the queue off in tcp_retransmit/ 206 * tcp_transmit. 207 * Change the receive queue to assemble as it goes. This lets us 208 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 209 * tcp_data/tcp_read as well as the window shrink crud. 210 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 211 * tcp_queue_skb seem obvious routines to extract. 212 * 213 * This program is free software; you can redistribute it and/or 214 * modify it under the terms of the GNU General Public License 215 * as published by the Free Software Foundation; either version 216 * 2 of the License, or(at your option) any later version. 217 * 218 * Description of States: 219 * 220 * TCP_SYN_SENT sent a connection request, waiting for ack 221 * 222 * TCP_SYN_RECV received a connection request, sent ack, 223 * waiting for final ack in three-way handshake. 
224 * 225 * TCP_ESTABLISHED connection established 226 * 227 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 228 * transmission of remaining buffered data 229 * 230 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 231 * to shutdown 232 * 233 * TCP_CLOSING both sides have shutdown but we still have 234 * data we have to finish sending 235 * 236 * TCP_TIME_WAIT timeout to catch resent junk before entering 237 * closed, can only be entered from FIN_WAIT2 238 * or CLOSING. Required because the other end 239 * may not have gotten our last ACK causing it 240 * to retransmit the data packet (which we ignore) 241 * 242 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 243 * us to finish writing our data and to shutdown 244 * (we have to close() to move on to LAST_ACK) 245 * 246 * TCP_LAST_ACK out side has shutdown after remote has 247 * shutdown. There may still be data in our 248 * buffer that we have to finish sending 249 * 250 * TCP_CLOSE socket is finished 251 */ 252
253 /* 254 * RFC1122 status: 255 * NOTE: I'm not going to be doing comments in the code for this one except 256 * for violations and the like. tcp.c is just too big... If I say something 257 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out 258 * with Alan. -- MS 950903 259 * 260 * Use of PSH (4.2.2.2) 261 * MAY aggregate data sent without the PSH flag. (does) 262 * MAY queue data received without the PSH flag. (does) 263 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't) 264 * MAY implement PSH on send calls. (doesn't, thus:) 265 * MUST NOT buffer data indefinitely (doesn't [1 second]) 266 * MUST set PSH on last segment (does) 267 * MAY pass received PSH to application layer (doesn't) 268 * SHOULD send maximum-sized segment whenever possible. (almost always does) 269 * 270 * Window Size (4.2.2.3, 4.2.2.16) 271 * MUST treat window size as an unsigned number (does) 272 * SHOULD treat window size as a 32-bit number (does not) 273 * MUST NOT shrink window once it is offered (does not normally) 274 * 275 * Urgent Pointer (4.2.2.4) 276 * **MUST point urgent pointer to last byte of urgent data (not right 277 * after). (doesn't, to be like BSD) 278 * MUST inform application layer asynchronously of incoming urgent 279 * data. (does) 280 * MUST provide application with means of determining the amount of 281 * urgent data pending. (does) 282 * **MUST support urgent data sequence of arbitrary length. (doesn't, but 283 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) 284 * [Follows BSD 1 byte of urgent data] 285 * 286 * TCP Options (4.2.2.5) 287 * MUST be able to receive TCP options in any segment. (does) 288 * MUST ignore unsupported options (does) 289 * 290 * Maximum Segment Size Option (4.2.2.6) 291 * MUST implement both sending and receiving MSS. (does) 292 * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send 293 * it always). 
 *		(does, even when MSS == 536, which is legal)
 *	MUST assume MSS == 536 if no MSS received at connection setup (does)
 *	MUST calculate "effective send MSS" correctly:
 *		min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 *		(does - but allows operator override)
 *
 *	TCP Checksum (4.2.2.7)
 *	MUST generate and check TCP checksum. (does)
 *
 *	Initial Sequence Number Selection (4.2.2.8)
 *	MUST use the RFC 793 clock selection mechanism.  (doesn't, but it's
 *	  OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 *	  necessary for 10Mbps networks - and harder than BSD to spoof!)
 *
 *	Simultaneous Open Attempts (4.2.2.10)
 *	MUST support simultaneous open attempts (does)
 *
 *	Recovery from Old Duplicate SYN (4.2.2.11)
 *	MUST keep track of active vs. passive open (does)
 *
 *	RST segment (4.2.2.12)
 *	SHOULD allow an RST segment to contain data (does, but doesn't do
 *	  anything with it, which is standard)
 *
 *	Closing a Connection (4.2.2.13)
 *	MUST inform application of whether connection was closed by RST or
 *	  normal close. (does)
 *	MAY allow "half-duplex" close (treat connection as closed for the
 *	  local app, even before handshake is done). (does)
 *	MUST linger in TIME_WAIT for 2 * MSL (does)
 *
 *	Retransmission Timeout (4.2.2.15)
 *	MUST implement Jacobson's slow start and congestion avoidance
 *	  stuff. (does)
 *
 *	Probing Zero Windows (4.2.2.17)
 *	MUST support probing of zero windows. (does)
 *	MAY keep offered window closed indefinitely. (does)
 *	MUST allow remote window to stay closed indefinitely. (does)
 *
 *	Passive Open Calls (4.2.2.18)
 *	MUST NOT let new passive open affect other connections. (doesn't)
 *	MUST support passive opens (LISTENs) concurrently. (does)
 *
 *	Time to Live (4.2.2.19)
 *	MUST make TCP TTL configurable.
 *		(does - IP_TTL option)
 *
 *	Event Processing (4.2.2.20)
 *	SHOULD queue out-of-order segments. (does)
 *	MUST aggregate ACK segments whenever possible. (does but badly)
 *
 *	Retransmission Timeout Calculation (4.2.3.1)
 *	MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 *	  calculation. (does, or at least explains them in the comments 8*b)
 *	SHOULD initialize RTO to 0 and RTT to 3. (does)
 *
 *	When to Send an ACK Segment (4.2.3.2)
 *	SHOULD implement delayed ACK. (does)
 *	MUST keep ACK delay < 0.5 sec. (does)
 *
 *	When to Send a Window Update (4.2.3.3)
 *	MUST implement receiver-side SWS. (does)
 *
 *	When to Send Data (4.2.3.4)
 *	MUST implement sender-side SWS. (does)
 *	SHOULD implement Nagle algorithm. (does)
 *
 *	TCP Connection Failures (4.2.3.5)
 *	MUST handle excessive retransmissions "properly" (see the RFC). (does)
 *	SHOULD inform application layer of soft errors. (does)
 *
 *	TCP Keep-Alives (4.2.3.6)
 *	MAY provide keep-alives. (does)
 *	MUST make keep-alives configurable on a per-connection basis. (does)
 *	MUST default to no keep-alives. (does)
 *	**MUST make keep-alive interval configurable. (doesn't)
 *	**MUST make default keep-alive interval > 2 hours. (doesn't)
 *	MUST NOT interpret failure to ACK keep-alive packet as dead
 *	  connection. (doesn't)
 *	SHOULD send keep-alive with no data. (does)
 *
 *	TCP Multihoming (4.2.3.7)
 *	MUST get source address from IP layer before sending first
 *	  SYN. (does)
 *	MUST use same local address for all segments of a connection. (does)
 *
 *	IP Options (4.2.3.8)
 *	MUST ignore unsupported IP options. (does)
 *	MAY support Time Stamp and Record Route. (does)
 *	MUST allow application to specify a source route. (does)
 *	MUST allow received Source Route option to set route for all future
 *	  segments on this connection.
(does not (security issues)) 385 * 386 * ICMP messages (4.2.3.9) 387 * MUST act on ICMP errors. (does) 388 * MUST slow transmission upon receipt of a Source Quench. (does) 389 * MUST NOT abort connection upon receipt of soft Destination 390 * Unreachables (0, 1, 5), Time Exceededs and Parameter 391 * Problems. (doesn't) 392 * SHOULD report soft Destination Unreachables etc. to the 393 * application. (does) 394 * SHOULD abort connection upon receipt of hard Destination Unreachable 395 * messages (2, 3, 4). (does) 396 * 397 * Remote Address Validation (4.2.3.10) 398 * MUST reject as an error OPEN for invalid remote IP address. (does) 399 * MUST ignore SYN with invalid source address. (does) 400 * MUST silently discard incoming SYN for broadcast/multicast 401 * address. (does) 402 * 403 * Asynchronous Reports (4.2.4.1) 404 * MUST provide mechanism for reporting soft errors to application 405 * layer. (does) 406 * 407 * Type of Service (4.2.4.2) 408 * MUST allow application layer to set Type of Service. (does IP_TOS) 409 * 410 * (Whew. -- MS 950903) 411 **/ 412
413 #include <linux/types.h>
414 #include <linux/sched.h>
415 #include <linux/mm.h>
416 #include <linux/time.h>
417 #include <linux/string.h>
418 #include <linux/config.h>
419 #include <linux/socket.h>
420 #include <linux/sockios.h>
421 #include <linux/termios.h>
422 #include <linux/in.h>
423 #include <linux/fcntl.h>
424 #include <linux/inet.h>
425 #include <linux/netdevice.h>
426 #include <net/snmp.h>
427 #include <net/ip.h>
428 #include <net/protocol.h>
429 #include <net/icmp.h>
430 #include <net/tcp.h>
431 #include <net/arp.h>
432 #include <linux/skbuff.h>
433 #include <net/sock.h>
434 #include <net/route.h>
435 #include <linux/errno.h>
436 #include <linux/timer.h>
437 #include <asm/system.h>
438 #include <asm/segment.h>
439 #include <linux/mm.h>
440 #include <net/checksum.h>
441
442 /* 443 * The MSL timer is the 'normal' timer. 444 */ 445
446 #definereset_msl_timer(x,y,z) reset_timer(x,y,z)
447
448 #define SEQ_TICK 3
449 unsignedlongseq_offset;
450 structtcp_mibtcp_statistics;
451
452 /* 453 * Cached last hit socket 454 */ 455
456 volatileunsignedlongth_cache_saddr,th_cache_daddr;
457 volatileunsignedshortth_cache_dport, th_cache_sport;
458 volatilestructsock *th_cache_sk;
459
460 voidtcp_cache_zap(void)
/* */ 461 { 462 unsignedlongflags;
463 save_flags(flags);
464 cli();
465 th_cache_saddr=0;
466 th_cache_daddr=0;
467 th_cache_dport=0;
468 th_cache_sport=0;
469 th_cache_sk=NULL;
470 restore_flags(flags);
471 } 472
473 staticvoidtcp_close(structsock *sk, inttimeout);
474 staticvoidtcp_read_wakeup(structsock *sk);
475
476 /* 477 * The less said about this the better, but it works and will do for 1.2 (and 1.4 ;)) 478 */ 479
480 staticstructwait_queue *master_select_wakeup;
481
482 static__inline__intmin(unsignedinta, unsignedintb)
/* */ 483 { 484 if (a < b)
485 return(a);
486 return(b);
487 } 488
489 #undefSTATE_TRACE 490
491 #ifdefSTATE_TRACE 492 staticchar *statename[]={ 493 "Unused","Established","Syn Sent","Syn Recv",
494 "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
495 "Close Wait","Last ACK","Listen","Closing"
496 };
497 #endif 498
499 static__inline__voidtcp_set_state(structsock *sk, intstate)
/* */ 500 { 501 if(sk->state==TCP_ESTABLISHED)
502 tcp_statistics.TcpCurrEstab--;
503 #ifdefSTATE_TRACE 504 if(sk->debug)
505 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
506 #endif 507 /* This is a hack but it doesn't occur often and it's going to 508 be a real to fix nicely */ 509
510 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
511 { 512 wake_up_interruptible(&master_select_wakeup);
513 } 514 sk->state=state;
515 if(state==TCP_ESTABLISHED)
516 tcp_statistics.TcpCurrEstab++;
517 if(sk->state==TCP_CLOSE)
518 tcp_cache_zap();
519 } 520
521 /* 522 * This routine picks a TCP windows for a socket based on 523 * the following constraints 524 * 525 * 1. The window can never be shrunk once it is offered (RFC 793) 526 * 2. We limit memory per socket 527 */ 528
529
530 static__inline__unsignedshorttcp_select_window(structsock *sk)
/* */ 531 { 532 longfree_space = sock_rspace(sk);
533 longwindow = 0;
534
535 if (free_space > 1024)
536 free_space &= ~0x3FF; /* make free space a multiple of 1024 */ 537
538 if(sk->window_clamp)
539 free_space = min(sk->window_clamp, free_space);
540
541 /* 542 * compute the actual window i.e. 543 * old_window - received_bytes_on_that_win 544 */ 545
546 if (sk->mss == 0)
547 sk->mss = sk->mtu;
548
549 window = sk->window - (sk->acked_seq - sk->lastwin_seq);
550
551 if ( window < 0 ) { 552 window = 0;
553 printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n",
554 sk->window, sk->acked_seq, sk->lastwin_seq);
555 } 556
557 /* 558 * RFC 1122: 559 * "the suggested [SWS] avoidance algoritm for the receiver is to keep 560 * RECV.NEXT + RCV.WIN fixed until: 561 * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" 562 * 563 * i.e. don't raise the right edge of the window until you can't raise 564 * it MSS bytes 565 */ 566
567 if ( (free_space - window) >= min(sk->mss, MAX_WINDOW/2) )
568 window += ((free_space - window) / sk->mss) * sk->mss;
569
570 sk->window = window;
571 sk->lastwin_seq = sk->acked_seq;
572
573 returnsk->window;
574 } 575
576 /* 577 * This function returns the amount that we can raise the 578 * usable window. 579 */ 580
581 static__inline__unsignedshorttcp_raise_window(structsock *sk)
/* */ 582 { 583 longfree_space = sock_rspace(sk);
584 longwindow = 0;
585
586 if (free_space > 1024)
587 free_space &= ~0x3FF; /* make free space a multiple of 1024 */ 588
589 if(sk->window_clamp)
590 free_space = min(sk->window_clamp, free_space);
591
592 /* 593 * compute the actual window i.e. 594 * old_window - received_bytes_on_that_win 595 */ 596
597 window = sk->window - (sk->acked_seq - sk->lastwin_seq);
598
599 if (sk->mss == 0)
600 sk->mss = sk->mtu;
601
602 if ( window < 0 ) { 603 window = 0;
604 printk(KERN_DEBUG "TRW: win < 0 w=%d 1=%u 2=%u\n",
605 sk->window, sk->acked_seq, sk->lastwin_seq);
606 } 607
608 if ( (free_space - window) >= min(sk->mss, MAX_WINDOW/2) )
609 return ((free_space - window) / sk->mss) * sk->mss;
610
611 return 0;
612 } 613
614 /* 615 * Find someone to 'accept'. Must be called with 616 * sk->inuse=1 or cli() 617 */ 618
619 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 620 { 621 structsk_buff *p=skb_peek(&s->receive_queue);
622 if(p==NULL)
623 returnNULL;
624 do 625 { 626 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
627 returnp;
628 p=p->next;
629 } 630 while(p!=(structsk_buff *)&s->receive_queue);
631 returnNULL;
632 } 633
634 /* 635 * Remove a completed connection and return it. This is used by 636 * tcp_accept() to get connections from the queue. 637 */ 638
639 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 640 { 641 structsk_buff *skb;
642 unsignedlongflags;
643 save_flags(flags);
644 cli();
645 skb=tcp_find_established(s);
646 if(skb!=NULL)
647 skb_unlink(skb); /* Take it off the queue */ 648 restore_flags(flags);
649 returnskb;
650 } 651
652 /* 653 * This routine closes sockets which have been at least partially 654 * opened, but not yet accepted. Currently it is only called by 655 * tcp_close, and timeout mirrors the value there. 656 */ 657
658 staticvoidtcp_close_pending (structsock *sk)
/* */ 659 { 660 structsk_buff *skb;
661
662 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
663 { 664 skb->sk->dead=1;
665 tcp_close(skb->sk, 0);
666 kfree_skb(skb, FREE_READ);
667 } 668 return;
669 } 670
671 /* 672 * Enter the time wait state. 673 */ 674
675 staticvoidtcp_time_wait(structsock *sk)
/* */ 676 { 677 tcp_set_state(sk,TCP_TIME_WAIT);
678 sk->shutdown = SHUTDOWN_MASK;
679 if (!sk->dead)
680 sk->state_change(sk);
681 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
682 } 683
684 /* 685 * A socket has timed out on its send queue and wants to do a 686 * little retransmitting. Currently this means TCP. 687 */ 688
689 voidtcp_do_retransmit(structsock *sk, intall)
/* */ 690 { 691 structsk_buff * skb;
692 structproto *prot;
693 structdevice *dev;
694 intct=0;
695 structrtable *rt;
696
697 prot = sk->prot;
698 skb = sk->send_head;
699
700 while (skb != NULL)
701 { 702 structtcphdr *th;
703 structiphdr *iph;
704 intsize;
705
706 dev = skb->dev;
707 IS_SKB(skb);
708 skb->when = jiffies;
709
710 /* dl1bke 960201 - @%$$! Hope this cures strange race conditions */ 711 /* with AX.25 mode VC. (esp. DAMA) */ 712 /* if the buffer is locked we should not retransmit */ 713 /* anyway, so we don't need all the fuss to prepare */ 714 /* the buffer in this case. */ 715 /* (the skb_pull() changes skb->data while we may */ 716 /* actually try to send the data. Ough. A side */ 717 /* effect is that we'll send some unnecessary data, */ 718 /* but the alternative is desastrous... */ 719
720 if (skb_device_locked(skb))
721 break;
722
723 /* 724 * Discard the surplus MAC header 725 */ 726
727 skb_pull(skb,((unsignedchar *)skb->ip_hdr)-skb->data);
728
729 /* 730 * In general it's OK just to use the old packet. However we 731 * need to use the current ack and window fields. Urg and 732 * urg_ptr could possibly stand to be updated as well, but we 733 * don't keep the necessary data. That shouldn't be a problem, 734 * if the other end is doing the right thing. Since we're 735 * changing the packet, we have to issue a new IP identifier. 736 */ 737
738 iph = (structiphdr *)skb->data;
739 th = (structtcphdr *)(((char *)iph) + (iph->ihl << 2));
740 size = ntohs(iph->tot_len) - (iph->ihl<<2);
741
742 /* 743 * Note: We ought to check for window limits here but 744 * currently this is done (less efficiently) elsewhere. 745 */ 746
747 /* 748 * Put a MAC header back on (may cause ARPing) 749 */ 750
751 { 752 /* ANK: UGLY, but the bug, that was here, should be fixed. 753 */ 754 structoptions * opt = (structoptions*)skb->proto_priv;
755 rt = ip_check_route(&sk->ip_route_cache, opt->srr?opt->faddr:iph->daddr, skb->localroute);
756 } 757
758 iph->id = htons(ip_id_count++);
759 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 760 if (rt && ntohs(iph->tot_len) > rt->rt_mtu)
761 iph->frag_off &= ~htons(IP_DF);
762 #endif 763 ip_send_check(iph);
764
765 if (rt==NULL) /* Deep poo */ 766 { 767 if(skb->sk)
768 { 769 skb->sk->err_soft=ENETUNREACH;
770 skb->sk->error_report(skb->sk);
771 } 772 } 773 else 774 { 775 dev=rt->rt_dev;
776 skb->raddr=rt->rt_gateway;
777 skb->dev=dev;
778 skb->arp=1;
779 if (rt->rt_hh)
780 { 781 memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len);
782 if (!rt->rt_hh->hh_uptodate)
783 { 784 skb->arp = 0;
785 #ifRT_CACHE_DEBUG >= 2
786 printk("tcp_do_retransmit: hh miss %08x via %08x\n", iph->daddr, rt->rt_gateway);
787 #endif 788 } 789 } 790 elseif (dev->hard_header)
791 { 792 if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
793 skb->arp=0;
794 } 795
796 /* 797 * This is not the right way to handle this. We have to 798 * issue an up to date window and ack report with this 799 * retransmit to keep the odd buggy tcp that relies on 800 * the fact BSD does this happy. 801 * We don't however need to recalculate the entire 802 * checksum, so someone wanting a small problem to play 803 * with might like to implement RFC1141/RFC1624 and speed 804 * this up by avoiding a full checksum. 805 */ 806
807 th->ack_seq = htonl(sk->acked_seq);
808 sk->ack_backlog = 0;
809 sk->bytes_rcv = 0;
810 th->window = ntohs(tcp_select_window(sk));
811 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
812
813 /* 814 * If the interface is (still) up and running, kick it. 815 */ 816
817 if (dev->flags & IFF_UP)
818 { 819 /* 820 * If the packet is still being sent by the device/protocol 821 * below then don't retransmit. This is both needed, and good - 822 * especially with connected mode AX.25 where it stops resends 823 * occurring of an as yet unsent anyway frame! 824 * We still add up the counts as the round trip time wants 825 * adjusting. 826 */ 827 if (sk && !skb_device_locked(skb))
828 { 829 /* Remove it from any existing driver queue first! */ 830 skb_unlink(skb);
831 /* Now queue it */ 832 ip_statistics.IpOutRequests++;
833 dev_queue_xmit(skb, dev, sk->priority);
834 } 835 } 836 } 837
838 /* 839 * Count retransmissions 840 */ 841
842 ct++;
843 sk->prot->retransmits ++;
844 tcp_statistics.TcpRetransSegs++;
845
846
847 /* 848 * Only one retransmit requested. 849 */ 850
851 if (!all)
852 break;
853
854 /* 855 * This should cut it off before we send too many packets. 856 */ 857
858 if (ct >= sk->cong_window)
859 break;
860 skb = skb->link3;
861 } 862 } 863
864 /* 865 * Reset the retransmission timer 866 */ 867
868 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 869 { 870 del_timer(&sk->retransmit_timer);
871 sk->ip_xmit_timeout = why;
872 if((long)when < 0)
873 { 874 when=3;
875 printk("Error: Negative timer in xmit_timer\n");
876 } 877 sk->retransmit_timer.expires=jiffies+when;
878 add_timer(&sk->retransmit_timer);
879 } 880
881 /* 882 * This is the normal code called for timeouts. It does the retransmission 883 * and then does backoff. tcp_do_retransmit is separated out because 884 * tcp_ack needs to send stuff from the retransmit queue without 885 * initiating a backoff. 886 */ 887
888
889 voidtcp_retransmit_time(structsock *sk, intall)
/* */ 890 { 891 tcp_do_retransmit(sk, all);
892
893 /* 894 * Increase the timeout each time we retransmit. Note that 895 * we do not increase the rtt estimate. rto is initialized 896 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 897 * that doubling rto each time is the least we can get away with. 898 * In KA9Q, Karn uses this for the first few times, and then 899 * goes to quadratic. netBSD doubles, but only goes up to *64, 900 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 901 * defined in the protocol as the maximum possible RTT. I guess 902 * we'll have to use something other than TCP to talk to the 903 * University of Mars. 904 * 905 * PAWS allows us longer timeouts and large windows, so once 906 * implemented ftp to mars will work nicely. We will have to fix 907 * the 120 second clamps though! 908 */ 909
910 sk->retransmits++;
911 sk->prot->retransmits++;
912 sk->backoff++;
913 sk->rto = min(sk->rto << 1, 120*HZ);
914 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
915 } 916
917
918 /* 919 * A timer event has trigger a tcp retransmit timeout. The 920 * socket xmit queue is ready and set up to send. Because 921 * the ack receive code keeps the queue straight we do 922 * nothing clever here. 923 */ 924
925 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 926 { 927 if (all)
928 { 929 tcp_retransmit_time(sk, all);
930 return;
931 } 932
933 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 934 /* sk->ssthresh in theory can be zero. I guess that's OK */ 935 sk->cong_count = 0;
936
937 sk->cong_window = 1;
938
939 /* Do the actual retransmit. */ 940 tcp_retransmit_time(sk, all);
941 } 942
943 /* 944 * A write timeout has occurred. Process the after effects. 945 */ 946
/*
 *	A write timeout has occurred. Process the after effects.
 *
 *	Returns 1 if the socket is still usable, 0 if it has been
 *	closed (in which case the socket has also been released).
 */

static int tcp_write_timeout(struct sock *sk)
{
	/*
	 *	Look for a 'soft' timeout: every 8th retransmit on an
	 *	established connection, or past TCP_RETR1 otherwise.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
	{
		/*
		 *	Attempt to recover if arp has changed (unlikely!) or
		 *	a route has shifted (not supported prior to 1.3).
		 */
		ip_rt_advice(&sk->ip_route_cache, 0);
	}

	/*
	 *	Have we tried to SYN too many times (repent repent 8))
	 */

	if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
	{
		/* Prefer a previously recorded soft error over a bare timeout. */
		if(sk->err_soft)
			sk->err=sk->err_soft;
		else
			sk->err=ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		tcp_statistics.TcpAttemptFails++;	/* Is this right ??? - FIXME - */
		tcp_set_state(sk,TCP_CLOSE);
		/* Don't FIN, we got nothing back */
		release_sock(sk);
		return 0;
	}
	/*
	 *	Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2)
	{
		if(sk->err_soft)
			sk->err = sk->err_soft;
		else
			sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 *	Time wait the socket: half-closed states go to
		 *	TIME_WAIT, everything else is torn down now.
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 *	Clean up time.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			release_sock(sk);
			return 0;
		}
	}
	return 1;
}
1012 /*1013 * The TCP retransmit timer. This lacks a few small details.1014 *1015 * 1. An initial rtt timeout on the probe0 should cause what we can1016 * of the first write queue buffer to be split and sent.1017 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report1018 * ETIMEDOUT if we know an additional 'soft' error caused this.1019 * tcp_err should save a 'soft error' for us.1020 */1021
/*
 *	The TCP retransmit timer. This lacks a few small details.
 *
 *	1.	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 *
 *	'data' is the socket pointer smuggled through the timer's
 *	unsigned long data field.
 */

static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;	/* reason the timer was armed */

	/*
	 *	We are reset. We will send no more retransmits.
	 */

	if(sk->zapped)
		return;

	/*
	 *	Only process if socket is not in use; otherwise re-arm
	 *	the timer for one second from now and bail out.
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;
	sti();

	/* Let readers know about any backlogged acks before we work. */
	if (sk->ack_backlog && !sk->dead)
		sk->data_ready(sk,0);

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				/* Nothing unacked: this was a delayed-ack kick. */
				if (sk->ack_backlog)
					tcp_read_wakeup(sk);
				restore_flags(flags);
			}
			else
			{
				/*
				 *	Kicked by a delayed ack. Reset timer
				 *	correctly now (the head segment has not
				 *	actually timed out yet).
				 */
				if (jiffies < skb->when + sk->rto)
				{
					if (sk->ack_backlog)
						tcp_read_wakeup(sk);
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission
				 */
				sk->retransmits++;
				sk->prot->retransmits++;
				sk->prot->retransmit (sk, 0);
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
1129 /*1130 * This routine is called by the ICMP module when it gets some1131 * sort of error condition. If err < 0 then the socket should1132 * be closed and the error returned to the user. If err > 01133 * it's just the icmp type << 8 | icmp code. After adjustment1134 * header points to the first 8 bytes of the tcp header. We need1135 * to find the appropriate port.1136 */1137
/*
 *	This routine is called by the ICMP module when it gets some
 *	sort of error condition. If err < 0 then the socket should
 *	be closed and the error returned to the user. If err > 0
 *	it's just the icmp type << 8 | icmp code. After adjustment
 *	header points to the first 8 bytes of the tcp header. We need
 *	to find the appropriate port.
 */

void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
	__u32 saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th = (struct tcphdr *)header;
	struct sock *sk;

	/*
	 *	This one is _WRONG_. FIXME urgently.
	 *	(Backing up from the TCP header to reach the IP header
	 *	assumes an option-free IP header — hence the FIXME.)
	 */
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
	struct iphdr *iph=(struct iphdr *)(header-sizeof(struct iphdr));
#endif
	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)
		return;

	if (type == ICMP_SOURCE_QUENCH)
	{
		/*
		 * FIXME:
		 * For now we will just trigger a linear backoff.
		 * The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	if (type == ICMP_PARAMETERPROB)
	{
		sk->err=EPROTO;
		sk->error_report(sk);
	}

#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
	{
		struct rtable * rt;
		/*
		 * Ugly trick to pass MTU to protocol layer.
		 * Really we should add argument "info" to error handler.
		 */
		unsigned short new_mtu = ntohs(iph->id);

		/* Shrink the cached route's MTU if it exceeds the new value. */
		if ((rt = sk->ip_route_cache) != NULL)
			if (rt->rt_mtu > new_mtu)
				rt->rt_mtu = new_mtu;

		/* Only shrink sk->mtu when the new MTU leaves room for the
		 * IP+TCP headers (second clause guards the subtraction). */
		if (sk->mtu > new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr)
			&& new_mtu > sizeof(struct iphdr)+sizeof(struct tcphdr))
			sk->mtu = new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);

		return;
	}
#endif

	/*
	 * If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 * (code < 13 bounds the icmp_err_convert[] table lookup.)
	 */

	if (code < 13)
	{
		if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
		{
			sk->err = icmp_err_convert[code].errno;
			if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			{
				tcp_statistics.TcpAttemptFails++;
				tcp_set_state(sk,TCP_CLOSE);
				sk->error_report(sk);		/* Wake people up to see the error (see connect in sock.c) */
			}
		}
		else	/* Only an error on timeout */
			sk->err_soft = icmp_err_convert[code].errno;
	}
}
1218
1219 /*1220 * Walk down the receive queue counting readable data until we hit the end or we find a gap1221 * in the received data queue (ie a frame missing that needs sending to us). Not1222 * sorting using two queues as data arrives makes life so much harder.1223 */1224
/*
 *	Walk down the receive queue counting readable data until we hit the end or we find a gap
 *	in the received data queue (ie a frame missing that needs sending to us). Not
 *	sorting using two queues as data arrives makes life so much harder.
 *
 *	Returns the number of bytes readable from sk->copied_seq onwards,
 *	stopping at the first hole or (once something is counted) at a PSH.
 */

static int tcp_readable(struct sock *sk)
{
	unsigned long counted;		/* sequence position walked so far */
	unsigned long amount;		/* readable byte count accumulated */
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	/* Walk the queue with interrupts off so it cannot change under us. */
	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 *	Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->seq))		/* Found a hole so stops here */
			break;
		sum = skb->len - (counted - skb->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;			/* SYN occupies a sequence number but carries no data */
		if (sum > 0)
		{					/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;	/* the SYN itself is not readable data */
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop. This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
1296 /*1297 * LISTEN is a special case for select..1298 */1299 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */1300 {1301 if (sel_type == SEL_IN) {1302 intretval;
1303
1304 sk->inuse = 1;
1305 retval = (tcp_find_established(sk) != NULL);
1306 release_sock(sk);
1307 if (!retval)
1308 select_wait(&master_select_wakeup,wait);
1309 returnretval;
1310 }1311 return 0;
1312 }1313
1314
/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to set "sk->inuse", as the upper select layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 *
 *	Returns 1 when the requested condition holds now; otherwise
 *	registers the caller on sk->sleep and returns 0.
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type) {
	case SEL_IN:
		if (sk->err)
			return 1;	/* pending error is "readable" */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;		/* still connecting: wait */

		if (sk->shutdown & RCV_SHUTDOWN)
			return 1;	/* EOF is readable */

		if (sk->acked_seq == sk->copied_seq)
			break;		/* nothing new acked: wait */

		/* Readable unless the only new byte is out-of-band data
		 * that the reader would skip (non-inline urgent byte). */
		if (sk->urg_seq != sk->copied_seq ||
		    sk->acked_seq != sk->copied_seq+1 ||
		    sk->urginline || !sk->urg_data)
			return 1;
		break;

	case SEL_OUT:
		if (sk->err)
			return 1;
		if (sk->shutdown & SEND_SHUTDOWN)
			return 0;	/* writes can never succeed again */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;
		/*
		 * This is now right thanks to a small fix
		 * by Matt Dillon.
		 */

		/* Require room for a full segment plus overhead before
		 * declaring the socket writable. */
		if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
			break;
		return 1;

	case SEL_EX:
		if (sk->urg_data)
			return 1;	/* out-of-band data pending */
		break;
	}
	select_wait(sk->sleep, wait);
	return 0;
}
/*
 *	TCP socket ioctl()s: readable byte count (TIOCINQ), at-urgent-mark
 *	test (SIOCATMARK) and unsent-space query (TIOCOUTQ).  The result
 *	is written back to user space as an int at 'arg'.
 */
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	int err;
	switch(cmd)
	{

		case TIOCINQ:
#ifdef FIXME	/* FIXME: */
		case FIONREAD:
#endif
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);

			/* Lock the socket while we walk the receive queue. */
			sk->inuse = 1;
			amount = tcp_readable(sk);
			release_sock(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		case SIOCATMARK:
		{
			/* Non-zero when the read pointer sits exactly at the
			 * urgent mark. */
			int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;

			err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
			if (err)
				return err;
			put_user(answ,(int *) arg);
			return(0);
		}
		case TIOCOUTQ:
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN) return(-EINVAL);
			amount = sock_wspace(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		default:
			return(-EINVAL);
	}
}
1423
1424 /*1425 * This routine computes a TCP checksum. 1426 *1427 * Modified January 1995 from a go-faster DOS routine by1428 * Jorge Cwik <jorge@laser.satlink.net>1429 */1430
/*
 *	This routine computes a TCP checksum.
 *
 *	Modified January 1995 from a go-faster DOS routine by
 *	Jorge Cwik <jorge@laser.satlink.net>
 *
 *	'base' is the partial sum over the TCP header and payload;
 *	csum_tcpudp_magic() folds in the pseudo-header (addresses,
 *	length, protocol).  'th' is unused here but kept for the
 *	established call signature.
 */

unsigned short tcp_check(struct tcphdr *th, int len,
	 unsigned long saddr, unsigned long daddr, unsigned long base)
{
	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
}
1437 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1438 unsignedlongdaddr, intlen, structsock *sk)
1439 {1440 th->check = 0;
1441 th->check = tcp_check(th, len, saddr, daddr,
1442 csum_partial((char *)th,len,0));
1443 return;
1444 }1445
1446 /*1447 * This is the main buffer sending routine. We queue the buffer1448 * having checked it is sane seeming.1449 */1450
/*
 *	This is the main buffer sending routine. We queue the buffer
 *	having checked it is sane seeming.
 */

static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 *	length of packet (not counting length of pre-tcp headers)
	 */

	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 *	Sanity check it..
	 */

	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 *	If we have queued a header size packet.. (these crash a few
	 *	tcp stacks if ack is not set)
	 */

	if (size == sizeof(struct tcphdr))
	{
		/* If it's got a syn or fin it's notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 *	Actual processing: record the sequence span of the segment
	 *	(end_seq excludes the TCP header itself).
	 */

	tcp_statistics.TcpOutSegs++;
	skb->seq = ntohl(th->seq);
	skb->end_seq = skb->seq + size - 4*th->doff;

	/*
	 *	We must queue if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */

	if (after(skb->end_seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit.  So
		 * we shouldn't need to set it at all.  I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 *	If we don't fit we have to start the zero window
		 *	probes. This is broken - we really need to do a partial
		 *	send _first_ (This is what causes the Cisco and PC/TCP
		 *	grief).
		 */

		if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 *	This is going straight out: fill in the ack and
		 *	window fields, checksum and transmit.
		 */

		th->ack_seq = htonl(sk->acked_seq);
		th->window = htons(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 *	This is mad. The tcp retransmit queue is put together
		 *	by the ip layer. This causes half the problems with
		 *	unroutable FIN's and other things.
		 */

		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/* The segment carried our pending ack. */
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;

		/*
		 *	Set for next retransmit based on expected ACK time.
		 *	FIXME: We set this every time which means our
		 *	retransmits are really about a window behind.
		 */

		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
1566 /*1567 * Locking problems lead us to a messy situation where we can have1568 * multiple partially complete buffers queued up. This is really bad1569 * as we don't want to be sending partial buffers. Fix this with1570 * a semaphore or similar to lock tcp_write per socket.1571 *1572 * These routines are pretty self descriptive.1573 */1574
1575 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1576 {1577 structsk_buff * skb;
1578 unsignedlongflags;
1579
1580 save_flags(flags);
1581 cli();
1582 skb = sk->partial;
1583 if (skb) {1584 sk->partial = NULL;
1585 del_timer(&sk->partial_timer);
1586 }1587 restore_flags(flags);
1588 returnskb;
1589 }1590
1591 /*1592 * Empty the partial queue1593 */1594
1595 staticvoidtcp_send_partial(structsock *sk)
/* */1596 {1597 structsk_buff *skb;
1598
1599 if (sk == NULL)
1600 return;
1601 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1602 tcp_send_skb(sk, skb);
1603 }1604
1605 /*1606 * Queue a partial frame1607 */1608
1609 voidtcp_enqueue_partial(structsk_buff * skb, structsock * sk)
/* */1610 {1611 structsk_buff * tmp;
1612 unsignedlongflags;
1613
1614 save_flags(flags);
1615 cli();
1616 tmp = sk->partial;
1617 if (tmp)
1618 del_timer(&sk->partial_timer);
1619 sk->partial = skb;
1620 init_timer(&sk->partial_timer);
1621 /*1622 * Wait up to 1 second for the buffer to fill.1623 */1624 sk->partial_timer.expires = jiffies+HZ;
1625 sk->partial_timer.function = (void (*)(unsignedlong)) tcp_send_partial;
1626 sk->partial_timer.data = (unsignedlong) sk;
1627 add_timer(&sk->partial_timer);
1628 restore_flags(flags);
1629 if (tmp)
1630 tcp_send_skb(sk, tmp);
1631 }1632
1633
1634
1635 /*1636 * This routine sends an ack and also updates the window. 1637 */1638
1639 staticvoidtcp_send_ack(u32sequence, u32ack,
/* */1640 structsock *sk,
1641 structtcphdr *th, unsignedlongdaddr)
1642 {1643 structsk_buff *buff;
1644 structtcphdr *t1;
1645 structdevice *dev = NULL;
1646 inttmp;
1647
1648 if(sk->zapped)
1649 return; /* We have been reset, we may not send again */1650
1651 /*1652 * We need to grab some memory, and put together an ack,1653 * and then put it into the queue to be sent.1654 */1655
1656 buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1657 if (buff == NULL)
1658 {1659 /* 1660 * Force it to send an ack. We don't have to do this1661 * (ACK is unreliable) but it's much better use of 1662 * bandwidth on slow links to send a spare ack than1663 * resend packets. 1664 */1665
1666 sk->ack_backlog++;
1667 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
1668 {1669 reset_xmit_timer(sk, TIME_WRITE, HZ);
1670 }1671 return;
1672 }1673
1674 /*1675 * Assemble a suitable TCP frame1676 */1677
1678 buff->sk = sk;
1679 buff->localroute = sk->localroute;
1680
1681 /* 1682 * Put in the IP header and routing stuff. 1683 */1684
1685 tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1686 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1687 if (tmp < 0)
1688 {1689 buff->free = 1;
1690 sock_wfree(sk, buff);
1691 return;
1692 }1693 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
1694
1695 memcpy(t1, th, sizeof(*t1));
1696
1697 /*1698 * Swap the send and the receive. 1699 */1700
1701 t1->dest = th->source;
1702 t1->source = th->dest;
1703 t1->seq = ntohl(sequence);
1704 t1->ack = 1;
1705 sk->window = tcp_select_window(sk);
1706 t1->window = ntohs(sk->window);
1707 t1->res1 = 0;
1708 t1->res2 = 0;
1709 t1->rst = 0;
1710 t1->urg = 0;
1711 t1->syn = 0;
1712 t1->psh = 0;
1713 t1->fin = 0;
1714
1715 /*1716 * If we have nothing queued for transmit and the transmit timer1717 * is on we are just doing an ACK timeout and need to switch1718 * to a keepalive.1719 */1720
1721 if (ack == sk->acked_seq) {1722 sk->ack_backlog = 0;
1723 sk->bytes_rcv = 0;
1724 sk->ack_timed = 0;
1725
1726 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL1727 && sk->ip_xmit_timeout == TIME_WRITE)
1728 if(sk->keepopen)
1729 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1730 else1731 delete_timer(sk);
1732 }1733
1734 /*1735 * Fill in the packet and send it1736 */1737
1738 t1->ack_seq = htonl(ack);
1739 t1->doff = sizeof(*t1)/4;
1740 tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1741 if (sk->debug)
1742 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
1743 tcp_statistics.TcpOutSegs++;
1744 sk->prot->queue_xmit(sk, dev, buff, 1);
1745 }1746
1747
1748 /* 1749 * This routine builds a generic TCP header. 1750 */1751
1752 extern__inlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */1753 {1754
1755 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1756 th->seq = htonl(sk->write_seq);
1757 th->psh =(push == 0) ? 1 : 0;
1758 th->doff = sizeof(*th)/4;
1759 th->ack = 1;
1760 th->fin = 0;
1761 sk->ack_backlog = 0;
1762 sk->bytes_rcv = 0;
1763 sk->ack_timed = 0;
1764 th->ack_seq = htonl(sk->acked_seq);
1765 sk->window = tcp_select_window(sk);
1766 th->window = htons(sk->window);
1767
1768 return(sizeof(*th));
1769 }1770
1771 /*1772 * This routine copies from a user buffer into a socket,1773 * and starts the transmit system.1774 */1775
/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	Returns the number of bytes queued/sent, or a negative errno.
 *	Once anything has been copied we return the partial count in
 *	preference to an error.
 */

static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags)
{
	int copied = 0;			/* bytes consumed from user space so far */
	int copy;			/* bytes to place in the current segment */
	int tmp;
	int seglen;			/* bytes left in the current iovec */
	int iovct=0;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;	/* non-NULL: skb should go to the partial queue */
	struct proto *prot;
	struct device *dev = NULL;
	unsigned char *from;

	/*
	 *	Do sanity checking for sendmsg/sendto/send: a supplied
	 *	address must match the connected peer exactly.
	 */

	if (flags & ~(MSG_OOB|MSG_DONTROUTE))
		return -EINVAL;
	if (msg->msg_name)
	{
		struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
		if(sk->state == TCP_CLOSE)
			return -ENOTCONN;
		if (msg->msg_namelen < sizeof(*addr))
			return -EINVAL;
		if (addr->sin_family && addr->sin_family != AF_INET)
			return -EINVAL;
		if (addr->sin_port != sk->dummy_th.dest)
			return -EISCONN;
		if (addr->sin_addr.s_addr != sk->daddr)
			return -EISCONN;
	}

	/*
	 *	Ok commence sending: outer loop per iovec, inner loop per
	 *	segment carved out of it.
	 */

	while(iovct<msg->msg_iovlen)
	{
		seglen=msg->msg_iov[iovct].iov_len;
		from=msg->msg_iov[iovct++].iov_base;
		sk->inuse=1;
		prot = sk->prot;
		while(seglen > 0)
		{
			if (sk->err)
			{			/* Stop on an error */
				release_sock(sk);
				if (copied)
					return(copied);
				return sock_error(sk);
			}

			/*
			 *	First thing we do is make sure that we are established.
			 */

			if (sk->shutdown & SEND_SHUTDOWN)
			{
				release_sock(sk);
				sk->err = EPIPE;
				if (copied)
					return(copied);
				sk->err = 0;
				return(-EPIPE);
			}

			/*
			 *	Wait for a connection to finish.
			 */

			while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
			{
				if (sk->err)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return sock_error(sk);
				}

				/* Not connecting either: the connection is gone. */
				if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
				{
					release_sock(sk);
					if (copied)
						return(copied);

					if (sk->err)
						return sock_error(sk);

					if (sk->keepopen)
					{
						send_sig(SIGPIPE, current, 0);
					}
					return(-EPIPE);
				}

				if (nonblock || copied)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return(-EAGAIN);
				}

				release_sock(sk);
				cli();

				/* Re-test the state with interrupts off before
				 * sleeping, to close the wakeup race. */
				if (sk->state != TCP_ESTABLISHED &&
					sk->state != TCP_CLOSE_WAIT && sk->err == 0)
				{
					interruptible_sleep_on(sk->sleep);
					if (current->signal & ~current->blocked)
					{
						sti();
						if (copied)
							return(copied);
						return(-ERESTARTSYS);
					}
				}
				sk->inuse = 1;
				sti();
			}

			/*
			 *	The following code can result in copy <= if sk->mss is ever
			 *	decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
			 *	sk->mtu is constant once SYN processing is finished.  I.e. we
			 *	had better not get here until we've seen his SYN and at least one
			 *	valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
			 *	But ESTABLISHED should guarantee that.  sk->max_window is by definition
			 *	non-decreasing.  Note that any ioctl to set user_mss must be done
			 *	before the exchange of SYN's.  If the initial ack from the other
			 *	end has a window of 0, max_window and thus mss will both be 0.
			 */

			/*
			 *	Now we need to check if we have a half built packet.
			 */
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
			/*
			 *	FIXME: I'm almost sure that this fragment is BUG,
			 *	but it works... I do not know why 8) --ANK
			 *
			 *	Really, we should rebuild all the queues...
			 *	It's difficult. Temprorary hack is to send all
			 *	queued segments with allowed fragmentation.
			 */
			{
				int new_mss = min(sk->mtu, sk->max_window);
				if (new_mss < sk->mss)
				{
					tcp_send_partial(sk);
					sk->mss = new_mss;
				}
			}
#endif

			if ((skb = tcp_dequeue_partial(sk)) != NULL)
			{
				int hdrlen;

				/* IP header + TCP header */
				hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
					+ sizeof(struct tcphdr);

				/* Add more stuff to the end of skb->len */
				if (!(flags & MSG_OOB))
				{
					copy = min(sk->mss - (skb->len - hdrlen), seglen);
					if (copy <= 0)
					{
						printk("TCP: **bug**: \"copy\" <= 0\n");
						return -EFAULT;
					}
					memcpy_fromfs(skb_put(skb,copy), from, copy);
					from += copy;
					copied += copy;
					len -= copy;
					sk->write_seq += copy;
					seglen -= copy;
				}
				/* Full segment, OOB, or idle pipe: send it now;
				 * otherwise put it back as a partial frame. */
				if ((skb->len - hdrlen) >= sk->mss ||
					(flags & MSG_OOB) || !sk->packets_out)
					tcp_send_skb(sk, skb);
				else
					tcp_enqueue_partial(skb, sk);
				continue;
			}

			/*
			 *	We also need to worry about the window.
			 *	If window < 1/2 the maximum window we've seen from this
			 *	host, don't use it.  This is sender side
			 *	silly window prevention, as specified in RFC1122.
			 *	(Note that this is different than earlier versions of
			 *	SWS prevention, e.g. RFC813.).  What we actually do is
			 *	use the whole MSS.  Since the results in the right
			 *	edge of the packet being outside the window, it will
			 *	be queued for later rather than sent.
			 */

			copy = sk->window_seq - sk->write_seq;
			if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
				copy = sk->mss;
			if (copy > seglen)
				copy = seglen;

			/*
			 *	We should really check the window here also.
			 */

			send_tmp = NULL;
			if (copy < sk->mss && !(flags & MSG_OOB) && sk->packets_out)
			{
				/*
				 *	We will release the socket in case we sleep here.
				 */
				release_sock(sk);
				/*
				 *	NB: following must be mtu, because mss can be increased.
				 *	mss is always <= mtu
				 */
				skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
				sk->inuse = 1;
				send_tmp = skb;
			}
			else
			{
				/*
				 *	We will release the socket in case we sleep here.
				 */
				release_sock(sk);
				skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
				sk->inuse = 1;
			}

			/*
			 *	If we didn't get any memory, we need to sleep.
			 */

			if (skb == NULL)
			{
				sk->socket->flags |= SO_NOSPACE;
				if (nonblock)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return(-EAGAIN);
				}

				/*
				 *	FIXME: here is another race condition.
				 */

				tmp = sk->wmem_alloc;
				release_sock(sk);
				cli();
				/*
				 *	Again we will try to avoid it: only sleep if
				 *	no write memory was freed meanwhile and the
				 *	connection is still usable.
				 */
				if (tmp <= sk->wmem_alloc &&
					  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
					&& sk->err == 0)
				{
					sk->socket->flags &= ~SO_NOSPACE;
					interruptible_sleep_on(sk->sleep);
					if (current->signal & ~current->blocked)
					{
						sti();
						if (copied)
							return(copied);
						return(-ERESTARTSYS);
					}
				}
				sk->inuse = 1;
				sti();
				continue;
			}

			skb->sk = sk;
			skb->free = 0;
			skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

			/*
			 *	FIXME: we need to optimize this.
			 *	Perhaps some hints here would be good.
			 */

			tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
					 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
			if (tmp < 0 )
			{
				sock_wfree(sk, skb);
				release_sock(sk);
				if (copied)
					return(copied);
				return(tmp);
			}
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
			skb->ip_hdr->frag_off |= htons(IP_DF);
#endif
			skb->dev = dev;
			skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
			tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
			if (tmp < 0)
			{
				sock_wfree(sk, skb);
				release_sock(sk);
				if (copied)
					return(copied);
				return(tmp);
			}

			if (flags & MSG_OOB)
			{
				skb->h.th->urg = 1;
				skb->h.th->urg_ptr = ntohs(copy);
			}

			memcpy_fromfs(skb_put(skb,copy), from, copy);

			from += copy;
			copied += copy;
			len -= copy;
			seglen -= copy;
			skb->free = 0;
			sk->write_seq += copy;

			/* Sub-MSS segment with data in flight: hold it back. */
			if (send_tmp != NULL)
			{
				tcp_enqueue_partial(send_tmp, sk);
				continue;
			}
			tcp_send_skb(sk, skb);
		}
	}
	sk->err = 0;

	/*
	 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
	 *	interactive fast network servers. It's meant to be on and
	 *	it really improves the throughput though not the echo time
	 *	on my slow slip link - Alan
	 */

	/*
	 *	Avoid possible race on send_tmp - c/o Johannes Stille
	 */

	if(sk->partial && ((!sk->packets_out)
	/* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
2139 /*2140 * Send an ack if one is backlogged at this point. Ought to merge2141 * this with tcp_send_ack().2142 * This is called for delayed acks also.2143 */2144
/*
 *	Send an ack if one is backlogged at this point. Ought to merge
 *	this with tcp_send_ack().
 *	This is called for delayed acks also.
 */

static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	/* Nothing owed to the peer. */
	if (!sk->ack_backlog)
		return;

	/*
	 * If we're closed, don't send an ack, or we'll get a RST
	 * from the closed destination.
	 */
	if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
		return;

	/*
	 * FIXME: we need to put code here to prevent this routine from
	 * being called.  Being called once in a while is ok, so only check
	 * if this is the second time in a row.
	 */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		buff->free = 1;
		sock_wfree(sk, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	/* Start from the template header; note fin is not cleared here —
	 * presumably dummy_th carries fin==0. TODO confirm. */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;


	/* This frame carries the pending ack. */
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;

	sk->window = tcp_select_window(sk);
	t1->window = htons(sk->window);
	t1->ack_seq = htonl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2222
2223 /*2224 * FIXME:2225 * This routine frees used buffers.2226 * It should consider sending an ACK to let the2227 * other end know we now have a bigger window.2228 */2229
/*
 *	FIXME:
 *	This routine frees used buffers.
 *	It should consider sending an ACK to let the
 *	other end know we now have a bigger window.
 */

static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	struct sk_buff *skb;
	unsigned long rspace;

	save_flags(flags);
	cli();

	/*
	 *	See if we have anything to free up: only fully consumed
	 *	(used) buffers with no remaining users may go.
	 */

	skb = skb_peek(&sk->receive_queue);
	if (!skb || !skb->used || skb->users) {
		restore_flags(flags);
		return;
	}

	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can.
	 */

	do{
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
		skb = skb_peek(&sk->receive_queue);
	} while (skb && skb->used && !skb->users);
	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */

	rspace=sock_rspace(sk);
	if(sk->debug)
		printk("sk->rspace = %lu\n", rspace);
	/*
	 *	This area has caused the most trouble.  The current strategy
	 *	is to simply do nothing if the other end has room to send at
	 *	least 3 full packets, because the ack from those will auto-
	 *	matically update the window.  If the other end doesn't think
	 *	we have much space left, but we have room for at least 1 more
	 *	complete packet than it thinks we do, we will send an ack
	 *	immediately.  Otherwise we will wait up to .5 seconds in case
	 *	the user reads some more.
	 */
	sk->ack_backlog++;

	/*
	 *	It's unclear whether to use sk->mtu or sk->mss here.  They differ only
	 *	if the other end is offering a window smaller than the agreed on MSS
	 *	(called sk->mtu here).  In theory there's no connection between send
	 *	and receive, and so no reason to think that they're going to send
	 *	small packets.  For the moment I'm using the hack of reducing the mss
	 *	only on the send side, so I'm putting mtu here.
	 */

	if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
	{
		/* Send an ack right now. */
		tcp_read_wakeup(sk);
	}
	else
	{
		/* Force it to send an ack soon, but don't shorten an
		 * already-sooner pending timer. */
		int was_active = del_timer(&sk->retransmit_timer);
		if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
		{
			reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
		}
		else
			add_timer(&sk->retransmit_timer);
	}
}
2311
/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 */

/*
 *	Read the single byte of out-of-band data, BSD style.
 *	Returns 1 and copies the byte when urgent data is pending,
 *	0 at connection end, or a negative error.  Never blocks,
 *	whatever the socket's blocking mode (hence `nonblock' is unused).
 */
static int tcp_recv_urg(struct sock * sk, int nonblock,
	     struct msghdr *msg, int len, int flags, int *addr_len)
{
	/*
	 *	No URG data to read: inline delivery, nothing pending, or
	 *	the byte was already consumed.
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
		return sock_error(sk);

	if (sk->state == TCP_CLOSE || sk->done)
	{
		if (!sk->done)
		{
			sk->done = 1;
			return 0;	/* first read after close: clean EOF */
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	sk->inuse = 1;		/* lock the socket against the bottom half */
	if (sk->urg_data & URG_VALID)
	{
		/* low byte of urg_data holds the urgent octet itself */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;	/* mark consumed */
		memcpy_toiovec(msg->msg_iov, &c, 1);
		if(msg->msg_name)
		{
			/* fill in the peer's address for recvfrom() callers */
			struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
			sin->sin_family=AF_INET;
			sin->sin_addr.s_addr=sk->daddr;
			sin->sin_port=sk->dummy_th.dest;
		}
		if(addr_len)
			*addr_len=sizeof(struct sockaddr_in);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
2375
/*
 *	This routine copies from a sock struct into the user buffer.
 */

/*
 *	TCP receive path: copy up to `len' bytes of in-sequence data into
 *	the user's iovec.  Handles peeking, urgent-data boundaries, FIN,
 *	and blocking (sleeps until data arrives unless `nonblock').
 *	Returns the number of bytes copied or a negative error.
 */
static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags, int *addr_len)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */

	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */

	if (flags & MSG_OOB)
		return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

	/*
	 *	Copying sequence to update.  This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 *	A peek reads through a private copy so copied_seq is untouched.
	 */

	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		u32 offset;

		/*
		 * Are we at urgent data? Stop if we have read anything.
		 */

		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.
		 *	Set INTERRUPTIBLE before scanning so a wakeup that
		 *	races with the scan is not lost before schedule().
		 */

		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* a hole before this segment: nothing readable yet */
			if (before(*seq, skb->seq))
				break;
			offset = *seq - skb->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies one sequence number but no data */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* fully consumed; cleanup_rbuf may free it */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* partial read satisfied: return what we have */
		if (copied)
			break;

		if (sk->err)
		{
			copied = sock_error(sk);
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;
				break;		/* first EOF indication */
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Nothing to read: ack what we consumed so far, then sleep. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer.  We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */

		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */

		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?
		 *	Never copy across the urgent byte in one go: stop at
		 *	it, and (unless urginline) skip over it entirely.
		 */

		if (sk->urg_data)
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					if (!sk->urginline)
					{
						++*seq;		/* step past the urgent byte */
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;	/* read only up to it */
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */

		*seq += used;

		/*
		 *	This memcpy_tofs can sleep.  If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */

		memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb.  Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */

		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;	/* urgent point passed: clear it */
		if (used + offset < skb->len)
			continue;		/* more left in this skb */

		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;			/* fully consumed */
		continue;

	found_fin_ok:
		++*seq;				/* FIN consumes one sequence number */
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */

		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;	/* no more data will arrive */
		break;

	}

	/* fill in the peer's address for recvfrom() callers */
	if(copied>0 && msg->msg_name)
	{
		struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
		sin->sin_family=AF_INET;
		sin->sin_addr.s_addr=sk->daddr;
		sin->sin_port=sk->dummy_th.dest;
	}
	if(addr_len)
		*addr_len=sizeof(struct sockaddr_in);

	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2616
2617
2618 /*2619 * State processing on a close. This implements the state shift for2620 * sending our FIN frame. Note that we only send a FIN for some 2621 * states. A shutdown() may have already sent the FIN, or we may be2622 * closed.2623 */2624
2625 staticinttcp_close_state(structsock *sk, intdead)
/* */2626 {2627 intns=TCP_CLOSE;
2628 intsend_fin=0;
2629 switch(sk->state)
2630 {2631 caseTCP_SYN_SENT: /* No SYN back, no FIN needed */2632 break;
2633 caseTCP_SYN_RECV:
2634 caseTCP_ESTABLISHED: /* Closedown begin */2635 ns=TCP_FIN_WAIT1;
2636 send_fin=1;
2637 break;
2638 caseTCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */2639 caseTCP_FIN_WAIT2:
2640 caseTCP_CLOSING:
2641 ns=sk->state;
2642 break;
2643 caseTCP_CLOSE:
2644 caseTCP_LISTEN:
2645 break;
2646 caseTCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and2647 wait only for the ACK */2648 ns=TCP_LAST_ACK;
2649 send_fin=1;
2650 }2651
2652 tcp_set_state(sk,ns);
2653
2654 /*2655 * This is a (useful) BSD violating of the RFC. There is a2656 * problem with TCP as specified in that the other end could2657 * keep a socket open forever with no application left this end.2658 * We use a 3 minute timeout (about the same as BSD) then kill2659 * our end. If they send after that then tough - BUT: long enough2660 * that we won't make the old 4*rto = almost no time - whoops2661 * reset mistake.2662 */2663 if(dead && ns==TCP_FIN_WAIT2)
2664 {2665 inttimer_active=del_timer(&sk->timer);
2666 if(timer_active)
2667 add_timer(&sk->timer);
2668 else2669 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2670 }2671
2672 returnsend_fin;
2673 }2674
/*
 *	Send a fin.
 */

/*
 *	Build and transmit (or queue) our FIN segment.  May sleep in the
 *	buffer allocation, so the socket lock is dropped around it.  If
 *	unsent data is still queued the FIN is appended to the write queue
 *	instead of being transmitted immediately.
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;		/* re-take the socket lock */

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost.
		 *	(Not good).  Advance write_seq as if the FIN went out
		 *	and fall back on a timer to finish the close.
		 */

		buff->free = 1;
		sock_wfree(sk,buff);
		sk->write_seq++;
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);	/* keep the already-pending timer */
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}

	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	buff->seq = sk->write_seq;
	sk->write_seq++;		/* the FIN consumes one sequence number */
	buff->end_seq = sk->write_seq;
	t1->seq = htonl(buff->seq);
	t1->ack = 1;
	t1->ack_seq = htonl(sk->acked_seq);
	t1->window = htons(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 * If there is data in the write queue, the fin must be appended to
	 * the write queue so it goes out in order behind the data.
	 */

	if (skb_peek(&sk->write_queue) != NULL)
	{
		buff->free = 0;
		if (buff->next != NULL)
		{
			/* should never happen: buffer already on a list */
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	}
	else
	{
		/* Queue empty: transmit now and start the retransmit timer. */
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2776 /*2777 * Shutdown the sending side of a connection. Much like close except2778 * that we don't receive shut down or set sk->dead=1.2779 */2780
2781 voidtcp_shutdown(structsock *sk, inthow)
/* */2782 {2783 /*2784 * We need to grab some memory, and put together a FIN,2785 * and then put it into the queue to be sent.2786 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2787 */2788
2789 if (!(how & SEND_SHUTDOWN))
2790 return;
2791
2792 /*2793 * If we've already sent a FIN, or it's a closed state2794 */2795
2796 if (sk->state == TCP_FIN_WAIT1 ||
2797 sk->state == TCP_FIN_WAIT2 ||
2798 sk->state == TCP_CLOSING ||
2799 sk->state == TCP_LAST_ACK ||
2800 sk->state == TCP_TIME_WAIT ||
2801 sk->state == TCP_CLOSE ||
2802 sk->state == TCP_LISTEN2803 )
2804 {2805 return;
2806 }2807 sk->inuse = 1;
2808
2809 /*2810 * flag that the sender has shutdown2811 */2812
2813 sk->shutdown |= SEND_SHUTDOWN;
2814
2815 /*2816 * Clear out any half completed packets. 2817 */2818
2819 if (sk->partial)
2820 tcp_send_partial(sk);
2821
2822 /*2823 * FIN if needed2824 */2825
2826 if(tcp_close_state(sk,0))
2827 tcp_send_fin(sk);
2828
2829 release_sock(sk);
2830 }2831
/*
 *	This routine will send an RST to the other tcp.
 */

/*
 *	Build and transmit an RST in reply to the segment `th' received
 *	from daddr.  Sequence/ack fields follow RFC 793 reset generation:
 *	if the offender carried an ACK we reset with its ack number and no
 *	ACK of our own; otherwise we send seq 0 and ACK the offending
 *	segment.  No socket is involved (sk/buff->sk are NULL).
 */
static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	int tmp;
	struct device *ndev=NULL;

	/*
	 *	Cannot reset a reset (Think about it).
	 */

	if(th->rst)
		return;

	/*
	 *	We need to grab some memory, and put together an RST,
	 *	and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	  	return;

	buff->sk = NULL;
	buff->dev = dev;
	buff->localroute = 0;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr),tos,ttl,NULL);
	if (tmp < 0)
	{
  		buff->free = 1;
		sock_wfree(NULL, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	memcpy(t1, th, sizeof(*t1));	/* start from the offender's header */

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->rst = 1;
	t1->window = 0;

	if(th->ack)
	{
		/* They ACKed: reset takes their ack number as our seq. */
	  	t1->ack = 0;
	  	t1->seq = th->ack_seq;
	  	t1->ack_seq = 0;
	}
	else
	{
		/* No ACK: seq 0, and ACK everything they sent (SYN counts as one). */
	  	t1->ack = 1;
	  	if(!th->syn)
			t1->ack_seq = th->seq;
		else
			t1->ack_seq = htonl(ntohl(th->seq)+1);
		t1->seq = 0;
	}

	t1->syn = 0;
	t1->urg = 0;
	t1->fin = 0;
	t1->psh = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
	prot->queue_xmit(NULL, ndev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2915
2916 /*2917 * Look for tcp options. Parses everything but only knows about MSS.2918 * This routine is always called with the packet containing the SYN.2919 * However it may also be called with the ack to the SYN. So you2920 * can't assume this is always the SYN. It's always called after2921 * we have set up sk->mtu to our own MTU.2922 *2923 * We need at minimum to add PAWS support here. Possibly large windows2924 * as Linux gets deployed on 100Mb/sec networks.2925 */2926
2927 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2928 {2929 unsignedchar *ptr;
2930 intlength=(th->doff*4)-sizeof(structtcphdr);
2931 intmss_seen = 0;
2932
2933 ptr = (unsignedchar *)(th + 1);
2934
2935 while(length>0)
2936 {2937 intopcode=*ptr++;
2938 intopsize=*ptr++;
2939 switch(opcode)
2940 {2941 caseTCPOPT_EOL:
2942 return;
2943 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */2944 length--;
2945 ptr--; /* the opsize=*ptr++ above was a mistake */2946 continue;
2947
2948 default:
2949 if(opsize<=2) /* Avoid silly options looping forever */2950 return;
2951 switch(opcode)
2952 {2953 caseTCPOPT_MSS:
2954 if(opsize==4 && th->syn)
2955 {2956 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2957 mss_seen = 1;
2958 }2959 break;
2960 /* Add other options here as people feel the urge to implement stuff like large windows */2961 }2962 ptr+=opsize-2;
2963 length-=opsize;
2964 }2965 }2966 if (th->syn)
2967 {2968 if (! mss_seen)
2969 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2970 }2971 #ifdefCONFIG_INET_PCTCP2972 sk->mss = min(sk->max_window >> 1, sk->mtu);
2973 #else2974 sk->mss = min(sk->max_window, sk->mtu);
2975 sk->max_unacked = 2 * sk->mss;
2976 #endif2977 }2978
/*
 *	Return the classful (A/B/C) netmask implied by the destination
 *	address `dst'.  Both the argument and the result are in network
 *	byte order.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host_order = ntohl(dst);
	unsigned long mask;

	if (IN_CLASSA(host_order))
		mask = IN_CLASSA_NET;
	else if (IN_CLASSB(host_order))
		mask = IN_CLASSB_NET;
	else
		mask = IN_CLASSC_NET;	/* class C (and above) default */

	return htonl(mask);
}
2989 /*2990 * Default sequence number picking algorithm.2991 * As close as possible to RFC 793, which2992 * suggests using a 250kHz clock.2993 * Further reading shows this assumes 2MB/s networks.2994 * For 10MB/s ethernet, a 1MHz clock is appropriate.2995 * That's funny, Linux has one built in! Use it!2996 */2997
2998 externinlineu32tcp_init_seq(void)
/* */2999 {3000 structtimevaltv;
3001 do_gettimeofday(&tv);
3002 returntv.tv_usec+tv.tv_sec*1000000;
3003 }3004
3005 /*3006 * This routine handles a connection request.3007 * It should make sure we haven't already responded.3008 * Because of the way BSD works, we have to send a syn/ack now.3009 * This also means it will be harder to close a socket which is3010 * listening.3011 */3012
3013 staticvoidtcp_conn_request(structsock *sk, structsk_buff *skb,
/* */3014 unsignedlongdaddr, unsignedlongsaddr,
3015 structoptions *opt, structdevice *dev, u32seq)
3016 {3017 structsk_buff *buff;
3018 structtcphdr *t1;
3019 unsignedchar *ptr;
3020 structsock *newsk;
3021 structtcphdr *th;
3022 structdevice *ndev=NULL;
3023 inttmp;
3024 structrtable *rt;
3025
3026 th = skb->h.th;
3027
3028 /* If the socket is dead, don't accept the connection. */3029 if (!sk->dead)
3030 {3031 sk->data_ready(sk,0);
3032 }3033 else3034 {3035 if(sk->debug)
3036 printk("Reset on %p: Connect on dead socket.\n",sk);
3037 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
3038 tcp_statistics.TcpAttemptFails++;
3039 kfree_skb(skb, FREE_READ);
3040 return;
3041 }3042
3043 /*3044 * Make sure we can accept more. This will prevent a3045 * flurry of syns from eating up all our memory.3046 */3047
3048 if (sk->ack_backlog >= sk->max_ack_backlog)
3049 {3050 tcp_statistics.TcpAttemptFails++;
3051 kfree_skb(skb, FREE_READ);
3052 return;
3053 }3054
3055 /*3056 * We need to build a new sock struct.3057 * It is sort of bad to have a socket without an inode attached3058 * to it, but the wake_up's will just wake up the listening socket,3059 * and if the listening socket is destroyed before this is taken3060 * off of the queue, this will take care of it.3061 */3062
3063 newsk = (structsock *) kmalloc(sizeof(structsock), GFP_ATOMIC);
3064 if (newsk == NULL)
3065 {3066 /* just ignore the syn. It will get retransmitted. */3067 tcp_statistics.TcpAttemptFails++;
3068 kfree_skb(skb, FREE_READ);
3069 return;
3070 }3071
3072 memcpy(newsk, sk, sizeof(*newsk));
3073 newsk->opt = NULL;
3074 newsk->ip_route_cache = NULL;
3075 if (opt && opt->optlen) {3076 sk->opt = (structoptions*)kmalloc(sizeof(structoptions)+opt->optlen, GFP_ATOMIC);
3077 if (!sk->opt) {3078 kfree_s(newsk, sizeof(structsock));
3079 tcp_statistics.TcpAttemptFails++;
3080 kfree_skb(skb, FREE_READ);
3081 return;
3082 }3083 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {3084 kfree_s(sk->opt, sizeof(structoptions)+opt->optlen);
3085 kfree_s(newsk, sizeof(structsock));
3086 tcp_statistics.TcpAttemptFails++;
3087 kfree_skb(skb, FREE_READ);
3088 return;
3089 }3090 }3091 skb_queue_head_init(&newsk->write_queue);
3092 skb_queue_head_init(&newsk->receive_queue);
3093 newsk->send_head = NULL;
3094 newsk->send_tail = NULL;
3095 skb_queue_head_init(&newsk->back_log);
3096 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/3097 newsk->rto = TCP_TIMEOUT_INIT;
3098 newsk->mdev = 0;
3099 newsk->max_window = 0;
3100 newsk->cong_window = 1;
3101 newsk->cong_count = 0;
3102 newsk->ssthresh = 0;
3103 newsk->backoff = 0;
3104 newsk->blog = 0;
3105 newsk->intr = 0;
3106 newsk->proc = 0;
3107 newsk->done = 0;
3108 newsk->partial = NULL;
3109 newsk->pair = NULL;
3110 newsk->wmem_alloc = 0;
3111 newsk->rmem_alloc = 0;
3112 newsk->localroute = sk->localroute;
3113
3114 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
3115
3116 newsk->err = 0;
3117 newsk->shutdown = 0;
3118 newsk->ack_backlog = 0;
3119 newsk->acked_seq = skb->seq+1;
3120 newsk->lastwin_seq = skb->seq+1;
3121 newsk->delay_acks = 1;
3122 newsk->copied_seq = skb->seq+1;
3123 newsk->fin_seq = skb->seq;
3124 newsk->state = TCP_SYN_RECV;
3125 newsk->timeout = 0;
3126 newsk->ip_xmit_timeout = 0;
3127 newsk->write_seq = seq;
3128 newsk->window_seq = newsk->write_seq;
3129 newsk->rcv_ack_seq = newsk->write_seq;
3130 newsk->urg_data = 0;
3131 newsk->retransmits = 0;
3132 newsk->linger=0;
3133 newsk->destroy = 0;
3134 init_timer(&newsk->timer);
3135 newsk->timer.data = (unsignedlong)newsk;
3136 newsk->timer.function = &net_timer;
3137 init_timer(&newsk->retransmit_timer);
3138 newsk->retransmit_timer.data = (unsignedlong)newsk;
3139 newsk->retransmit_timer.function=&retransmit_timer;
3140 newsk->dummy_th.source = skb->h.th->dest;
3141 newsk->dummy_th.dest = skb->h.th->source;
3142
3143 /*3144 * Swap these two, they are from our point of view. 3145 */3146
3147 newsk->daddr = saddr;
3148 newsk->saddr = daddr;
3149 newsk->rcv_saddr = daddr;
3150
3151 put_sock(newsk->num,newsk);
3152 newsk->dummy_th.res1 = 0;
3153 newsk->dummy_th.doff = 6;
3154 newsk->dummy_th.fin = 0;
3155 newsk->dummy_th.syn = 0;
3156 newsk->dummy_th.rst = 0;
3157 newsk->dummy_th.psh = 0;
3158 newsk->dummy_th.ack = 0;
3159 newsk->dummy_th.urg = 0;
3160 newsk->dummy_th.res2 = 0;
3161 newsk->acked_seq = skb->seq + 1;
3162 newsk->copied_seq = skb->seq + 1;
3163 newsk->socket = NULL;
3164
3165 /*3166 * Grab the ttl and tos values and use them 3167 */3168
3169 newsk->ip_ttl=sk->ip_ttl;
3170 newsk->ip_tos=skb->ip_hdr->tos;
3171
3172 /*3173 * Use 512 or whatever user asked for 3174 */3175
3176 /*3177 * Note use of sk->user_mss, since user has no direct access to newsk 3178 */3179
3180 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
3181 newsk->ip_route_cache = rt;
3182
3183 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3184 newsk->window_clamp = rt->rt_window;
3185 else3186 newsk->window_clamp = 0;
3187
3188 if (sk->user_mss)
3189 newsk->mtu = sk->user_mss;
3190 elseif (rt)
3191 newsk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
3192 else3193 newsk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
3194
3195 /*3196 * But not bigger than device MTU 3197 */3198
3199 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
3200
3201 #ifdefCONFIG_SKIP3202
3203 /*3204 * SKIP devices set their MTU to 65535. This is so they can take packets3205 * unfragmented to security process then fragment. They could lie to the3206 * TCP layer about a suitable MTU, but its easier to let skip sort it out3207 * simply because the final package we want unfragmented is going to be3208 *3209 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]3210 */3211
3212 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */3213 sk->mtu=skip_pick_mtu(sk->mtu,dev);
3214 #endif3215 /*3216 * This will min with what arrived in the packet 3217 */3218
3219 tcp_options(newsk,skb->h.th);
3220
3221 tcp_cache_zap();
3222
3223 buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3224 if (buff == NULL)
3225 {3226 sk->err = ENOMEM;
3227 newsk->dead = 1;
3228 newsk->state = TCP_CLOSE;
3229 /* And this will destroy it */3230 release_sock(newsk);
3231 kfree_skb(skb, FREE_READ);
3232 tcp_statistics.TcpAttemptFails++;
3233 return;
3234 }3235
3236 buff->sk = newsk;
3237 buff->localroute = newsk->localroute;
3238
3239 /*3240 * Put in the IP header and routing stuff. 3241 */3242
3243 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3244 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&newsk->ip_route_cache);
3245
3246 /*3247 * Something went wrong. 3248 */3249
3250 if (tmp < 0)
3251 {3252 sk->err = tmp;
3253 buff->free = 1;
3254 kfree_skb(buff,FREE_WRITE);
3255 newsk->dead = 1;
3256 newsk->state = TCP_CLOSE;
3257 release_sock(newsk);
3258 skb->sk = sk;
3259 kfree_skb(skb, FREE_READ);
3260 tcp_statistics.TcpAttemptFails++;
3261 return;
3262 }3263
3264 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
3265
3266 memcpy(t1, skb->h.th, sizeof(*t1));
3267 buff->seq = newsk->write_seq++;
3268 buff->end_seq = newsk->write_seq;
3269 /*3270 * Swap the send and the receive. 3271 */3272 t1->dest = skb->h.th->source;
3273 t1->source = newsk->dummy_th.source;
3274 t1->seq = ntohl(buff->seq);
3275 t1->ack = 1;
3276 newsk->sent_seq = newsk->write_seq;
3277 t1->window = ntohs(tcp_select_window(newsk));
3278 t1->res1 = 0;
3279 t1->res2 = 0;
3280 t1->rst = 0;
3281 t1->urg = 0;
3282 t1->psh = 0;
3283 t1->syn = 1;
3284 t1->ack_seq = htonl(newsk->acked_seq);
3285 t1->doff = sizeof(*t1)/4+1;
3286 ptr = skb_put(buff,4);
3287 ptr[0] = 2;
3288 ptr[1] = 4;
3289 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3290 ptr[3] =(newsk->mtu) & 0xff;
3291
3292 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3293 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3294 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3295 skb->sk = newsk;
3296
3297 /*3298 * Charge the sock_buff to newsk. 3299 */3300
3301 sk->rmem_alloc -= skb->truesize;
3302 newsk->rmem_alloc += skb->truesize;
3303
3304 skb_queue_tail(&sk->receive_queue,skb);
3305 sk->ack_backlog++;
3306 release_sock(newsk);
3307 tcp_statistics.TcpOutSegs++;
3308 }3309
3310
/*
 *	Close a TCP socket.  For a listener just drop everything pending;
 *	otherwise mark both directions shut, flush unread data on a
 *	descriptor close (timeout == 0), and run the close state machine,
 *	sending our FIN if it says so.
 */
static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 * We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 */

	sk->inuse = 1;		/* lock the socket */

	if(th_cache_sk==sk)
		tcp_cache_zap();	/* drop the demux cache entry for this socket */
	if(sk->state == TCP_LISTEN)
	{
		/* Special case */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);	/* reap any not-yet-accepted connections */
		release_sock(sk);
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;	/* both directions are finished */

	if (!sk->dead)
	  	sk->state_change(sk);	/* wake anyone sleeping on this socket */

	if (timeout == 0)
	{
		struct sk_buff *skb;

		/*
		 *  We need to flush the recv. buffs.  We do this only on the
		 *  descriptor close, not protocol-sourced closes, because the
		 *  reader process may not have drained the data yet!
		 */

		while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 *  Get rid off any half-completed packets.
		 */

		if (sk->partial)
			tcp_send_partial(sk);
	}


	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */

	if(timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* Run the state shift; send our FIN if it asks for one. */
		if(tcp_close_state(sk,1)==1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
3377
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue. This happens as incoming acks
 *	open up the remote window for us.
 */
static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->end_seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->end_seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->end_seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it. This
			 *	cannot currently occur.
			 */

			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);	/* tell writers space opened up */
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
			/*
			 * put in the ack seq and window at this point rather than earlier,
			 * in order to keep them monotonic.  We really want to avoid taking
			 * back window allocations.  That's legal, but RFC1122 says it's frowned on.
			 * Ack and window will in general have changed since this packet was put
			 * on the write queue.
			 */
			iph = skb->ip_hdr;
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			size = skb->len - (((unsigned char *) th) - skb->data);
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
			/* Oversized for the current mtu: allow fragmentation. */
			if (size > sk->mtu - sizeof(struct iphdr))
			{
				iph->frag_off &= ~htons(IP_DF);
				ip_send_check(iph);	/* IP checksum must be redone */
			}
#endif

			th->ack_seq = htonl(sk->acked_seq);
			th->window = htons(tcp_select_window(sk));

			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->end_seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */

			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/* This segment also acks everything we have seen. */
			sk->ack_backlog = 0;
			sk->bytes_rcv = 0;

			/*
			 *	Again we slide the timer wrongly
			 */

			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3479
/*
 *	This routine deals with incoming acks, but not outgoing ones.
 *
 *	Handles window updates, slow start / congestion avoidance,
 *	RTT estimation (Jacobson), retransmit-queue trimming, zero
 *	window probe completion and the ACK-driven state transitions
 *	(LAST_ACK, FIN_WAIT1, CLOSING, SYN_RECV).
 *
 *	Returns 0 if the ack is out of range (too new), 1 otherwise.
 */

extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
	u32 ack;
	int flag = 0;

	/*
	 * 1 - there was data in packet as well as ack or new data is sent or
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if (sk->zapped)
		return(1);	/* Dead, cant ack any more so why bother */

	/*
	 *	Have we discovered a larger window
	 */

	ack = ntohl(th->ack_seq);

	if (ntohs(th->window) > sk->max_window)
	{
		sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(sk->max_window >> 1, sk->mtu);
#else
		sk->mss = min(sk->max_window, sk->mtu);
#endif
	}

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */

	if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
	{
		if (sk->debug)
			printk("Ack ignored %u %u\n", ack, sk->sent_seq);

		/*
		 *	Keepalive processing.  An ack for data we never
		 *	sent is simply dropped.
		 */

		if (after(ack, sk->sent_seq))
		{
			return(0);
		}

		/*
		 *	Restart the keepalive timer.
		 */

		if (sk->keepopen)
		{
			if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
		}
		return(1);
	}

	/*
	 *	If there is data set flag 1
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 *	See if our window has been shrunk.
	 */

	if (after(sk->window_seq, ack + ntohs(th->window)))
	{
		/*
		 * We may need to move packets from the send queue
		 * to the write queue, if the window has been shrunk on us.
		 * The RFC says you are not allowed to shrink your window
		 * like this, but if the other end does, you must be able
		 * to deal with it.
		 */
		struct sk_buff *skb;
		struct sk_buff *skb2;
		struct sk_buff *wskb = NULL;

		skb2 = sk->send_head;
		sk->send_head = NULL;
		sk->send_tail = NULL;

		/*
		 *	This is an artifact of a flawed concept. We want one
		 *	queue and a smarter send routine when we send all.
		 */

		flag |= 4;	/* Window changed */

		sk->window_seq = ack + ntohs(th->window);
		cli();
		while (skb2 != NULL)
		{
			skb = skb2;
			skb2 = skb->link3;
			skb->link3 = NULL;
			if (after(skb->end_seq, sk->window_seq))
			{
				/* Fell outside the shrunken window: demote back
				   to the write queue, preserving order via wskb. */
				if (sk->packets_out > 0)
					sk->packets_out--;
				/* We may need to remove this from the dev send list. */
				if (skb->next != NULL)
				{
					skb_unlink(skb);
				}
				/* Now add it to the write_queue. */
				if (wskb == NULL)
					skb_queue_head(&sk->write_queue, skb);
				else
					skb_append(wskb, skb);
				wskb = skb;
			}
			else
			{
				/* Still inside the window: re-link onto the send queue. */
				if (sk->send_head == NULL)
				{
					sk->send_head = skb;
					sk->send_tail = skb;
				}
				else
				{
					sk->send_tail->link3 = skb;
					sk->send_tail = skb;
				}
				skb->link3 = NULL;
			}
		}
		sti();
	}

	/*
	 *	Pipe has emptied
	 */

	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out = 0;
	}

	/*
	 *	Update the right hand window edge of the host
	 */

	sk->window_seq = ack + ntohs(th->window);

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a
		 * counter and increment it once every cwnd times.  It's possible
		 * that this should be done only if sk->retransmits == 0.  I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly.  In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

	/*
	 *	Remember the highest ack received.
	 */

	sk->rcv_ack_seq = ack;

	/*
	 *	We passed data and got it acked, remove any soft error
	 *	log. Something worked...
	 */

	sk->err_soft = 0;

	/*
	 *	If this ack opens up a zero window, clear backoff.  It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open ?
		 */

		if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
		    ! before (sk->window_seq, sk->write_queue.next->end_seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt.  this eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < HZ/5)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = HZ/5;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	while (sk->send_head != NULL)
	{
		/* Check for a bug: the retransmit list must stay sequence-ordered. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->end_seq, sk->send_head->link3->end_seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived the other end.
		 */

		if (before(sk->send_head->end_seq, ack+1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 *	We were retransmitting.  don't count this in RTT est
				 */
				flag |= 2;

				/*
				 *	even though we've gotten an ack, we're still
				 *	retransmitting as long as we're sending from
				 *	the retransmit queue.  Keeping retransmits non-zero
				 *	prevents us from getting new data interspersed with
				 *	retransmissions.
				 */

				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 * Note that we only reset backoff and rto in the
			 * rtt recomputation code.  And that doesn't happen
			 * if there were retransmissions in effect.  So the
			 * first new packet after the retransmissions is
			 * sent with the backoff still in effect.  Not until
			 * we get an ack from a non-retransmitted packet do
			 * we reset the backoff and rto.  This allows us to deal
			 * with a situation where the network delay has increased
			 * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 *	We have one less packet out there.
			 */

			if (sk->packets_out > 0)
				sk->packets_out --;
			/*
			 *	Wake up the process, it can probably write more.
			 */
			if (!sk->dead)
				sk->write_space(sk);
			oskb = sk->send_head;

			if (!(flag&2))	/* Not retransmitting */
			{
				long m;

				/*
				 *	The following amusing code comes from Jacobson's
				 *	article in SIGCOMM '88.  Note that rtt and mdev
				 *	are scaled versions of rtt and mean deviation.
				 *	This is designed to be as fast as possible
				 *	m stands for "measurement".
				 */

				m = jiffies - oskb->when;  /* RTT */
				if (m <= 0)
					m = 1;		/* IS THIS RIGHT FOR <0 ??? */
				m -= (sk->rtt >> 3);	/* m is now error in rtt est */
				sk->rtt += m;		/* rtt = 7/8 rtt + 1/8 new */
				if (m < 0)
					m = -m;		/* m is now abs(error) */
				m -= (sk->mdev >> 2);	/* similar update on mdev */
				sk->mdev += m;		/* mdev = 3/4 mdev + 1/4 new */

				/*
				 *	Now update timeout.  Note that this removes any backoff.
				 */

				sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
				if (sk->rto > 120*HZ)
					sk->rto = 120*HZ;
				if (sk->rto < HZ/5)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
					sk->rto = HZ/5;
				sk->backoff = 0;
			}
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
					   In this case as we just set it up */
			cli();
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 *	We may need to remove this from the dev send list.
			 */

			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 * XXX someone ought to look at this too.. at the moment, if skb_peek()
	 * returns non-NULL, we complete ignore the timer stuff in the else
	 * clause.  We ought to organize the code so that else clause can
	 * (should) be executed regardless, possibly moving the PROBE timer
	 * reset over.  The skb_peek() thing should only move stuff to the
	 * write queue, NOT also manage the timer functions.
	 */

	/*
	 * Maybe we can take some stuff off of the write queue,
	 * and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room.
			 */
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 * from TCP_CLOSE we don't do anything
		 *
		 * from anything else, if there is write data (or fin) pending,
		 * we use a TIME_WRITE timeout, else if keepalive we reset to
		 * a KEEPALIVE timeout, else we delete the timer.
		 *
		 * We do not set flag for nominal write data, otherwise we may
		 * force a state where we start to write itsy bitsy tidbits
		 * of data.
		 */

		switch (sk->state) {
		case TCP_TIME_WAIT:
			/*
			 * keep us in TIME_WAIT until we stop getting packets,
			 * reset the timeout.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 * don't touch the timer.
			 */
			break;
		default:
			/*
			 * Must check send_head, write_queue, and ack_backlog
			 * to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 *	We have nothing queued but space to send.  Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
	 * we are now waiting for an acknowledge to our FIN.  The other end is
	 * already in TIME_WAIT.
	 *
	 * Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->debug)
			printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
				sk->rcv_ack_seq, sk->write_seq, sk->acked_seq, sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			sk->shutdown = SHUTDOWN_MASK;
			tcp_set_state(sk, TCP_CLOSE);
			return 1;
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 * Move to FIN_WAIT2 to await a FIN from the other end.  Set
	 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 * Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake
	 */

	if (sk->state == TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk, th);
		sk->dummy_th.dest = th->source;
		sk->copied_seq = sk->acked_seq;
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->max_window == 0)
		{
			sk->max_window = 32;	/* Sanity check */
			sk->mss = min(sk->max_window, sk->mtu);
		}
	}

	/*
	 * I make no guarantees about the first clause in the following
	 * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
	 * what conditions "!flag" would be true.  However I think the rest
	 * of the conditions would prevent that from causing any
	 * unnecessary retransmission.
	 *   Clearly if the first packet has expired it should be
	 * retransmitted.  The other alternative, "flag&2 && retransmits", is
	 * harder to explain:  You have to look carefully at how and when the
	 * timer is set and with what timeout.  The most recent transmission always
	 * sets the timer.  So in general if the most recent thing has timed
	 * out, everything before it has as well.  So we want to go ahead and
	 * retransmit some more.  If we didn't explicitly test for this
	 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 * would not be true.  If you look at the pattern of timing, you can
	 * show that rto is increased fast enough that the next packet would
	 * almost never be retransmitted immediately.  Then you'd end up
	 * waiting for a timeout to send each packet on the retransmission
	 * queue.  With my implementation of the Karn sampling algorithm,
	 * the timeout would double each time.  The net result is that it would
	 * take a hideous amount of time to recover from a single dropped packet.
	 * It's possible that there should also be a test for TIME_WRITE, but
	 * I think as long as "send_head != NULL" and "retransmit" is on, we've
	 * got to be in real retransmission mode.
	 *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
	 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 * As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	       (((flag&2) && sk->retransmits) ||
	       (sk->send_head->when + sk->rto < jiffies)))
	{
		if (sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk, 0);
		else
		{
			tcp_do_retransmit(sk, 1);
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return(1);
}
4078
/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 *
 *	Caller is the data path (tcp_data) once the FIN is in sequence;
 *	'th' is the header of the segment carrying the FIN.  Always
 *	returns 0.
 */

static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	/* Record where the stream ends; readers use this to detect EOF. */
	sk->fin_seq = skb->end_seq;

	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch (sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * move to CLOSE_WAIT, tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk, TCP_CLOSE_WAIT);
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * received a retransmission of the FIN,
			 * restart the TIME_WAIT timer.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens, we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */

			if (sk->ip_xmit_timeout != TIME_WRITE)
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk, TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * received a FIN -- send ACK and enter TIME_WAIT
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown |= SHUTDOWN_MASK;
			tcp_set_state(sk, TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * already in CLOSE
			 */
			break;
		default:
			tcp_set_state(sk, TCP_LAST_ACK);

			/* Start the timers. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}
4174
4175
4176 /*4177 * This routine handles the data. If there is room in the buffer,4178 * it will be have already been moved into it. If there is no4179 * room, then we will just have to discard the packet.4180 */4181
4182 extern/* __inline__ */inttcp_data(structsk_buff *skb, structsock *sk,
/* */4183 unsignedlongsaddr, unsignedshortlen)
4184 {4185 structsk_buff *skb1, *skb2;
4186 structtcphdr *th;
4187 intdup_dumped=0;
4188 u32new_seq, shut_seq;
4189
4190 th = skb->h.th;
4191 skb_pull(skb,th->doff*4);
4192 skb_trim(skb,len-(th->doff*4));
4193
4194 /*4195 * The bytes in the receive read/assembly queue has increased. Needed for the4196 * low memory discard algorithm 4197 */4198
4199 sk->bytes_rcv += skb->len;
4200
4201 if (skb->len == 0 && !th->fin)
4202 {4203 /* 4204 * Don't want to keep passing ack's back and forth. 4205 * (someone sent us dataless, boring frame)4206 */4207 if (!th->ack)
4208 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4209 kfree_skb(skb, FREE_READ);
4210 return(0);
4211 }4212
4213 /*4214 * We no longer have anyone receiving data on this connection.4215 */4216
4217 #ifndef TCP_DONT_RST_SHUTDOWN
4218
4219 if(sk->shutdown & RCV_SHUTDOWN)
4220 {4221 /*4222 * FIXME: BSD has some magic to avoid sending resets to4223 * broken 4.2 BSD keepalives. Much to my surprise a few non4224 * BSD stacks still have broken keepalives so we want to4225 * cope with it.4226 */4227
4228 if(skb->len) /* We don't care if it's just an ack or4229 a keepalive/window probe */4230 {4231 new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */4232
4233 /* Do this the way 4.4BSD treats it. Not what I'd4234 regard as the meaning of the spec but it's what BSD4235 does and clearly they know everything 8) */4236
4237 /*4238 * This is valid because of two things4239 *4240 * a) The way tcp_data behaves at the bottom.4241 * b) A fin takes effect when read not when received.4242 */4243
4244 shut_seq = sk->acked_seq+1; /* Last byte */4245
4246 if(after(new_seq,shut_seq))
4247 {4248 if(sk->debug)
4249 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4250 sk, new_seq, shut_seq, sk->blog);
4251 if(sk->dead)
4252 {4253 sk->acked_seq = new_seq + th->fin;
4254 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4255 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4256 tcp_statistics.TcpEstabResets++;
4257 sk->err = EPIPE;
4258 sk->error_report(sk);
4259 sk->shutdown = SHUTDOWN_MASK;
4260 tcp_set_state(sk,TCP_CLOSE);
4261 kfree_skb(skb, FREE_READ);
4262 return 0;
4263 }4264 }4265 }4266 }4267
4268 #endif4269
4270 /*4271 * Now we have to walk the chain, and figure out where this one4272 * goes into it. This is set up so that the last packet we received4273 * will be the first one we look at, that way if everything comes4274 * in order, there will be no performance loss, and if they come4275 * out of order we will be able to fit things in nicely.4276 *4277 * [AC: This is wrong. We should assume in order first and then walk4278 * forwards from the first hole based upon real traffic patterns.]4279 * 4280 */4281
4282 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */4283 {4284 skb_queue_head(&sk->receive_queue,skb);
4285 skb1= NULL;
4286 }4287 else4288 {4289 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
4290 {4291 if(sk->debug)
4292 {4293 printk("skb1=%p :", skb1);
4294 printk("skb1->seq = %d: ", skb1->seq);
4295 printk("skb->seq = %d\n",skb->seq);
4296 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4297 sk->acked_seq);
4298 }4299
4300 /*4301 * Optimisation: Duplicate frame or extension of previous frame from4302 * same sequence point (lost ack case).4303 * The frame contains duplicate data or replaces a previous frame4304 * discard the previous frame (safe as sk->inuse is set) and put4305 * the new one in its place.4306 */4307
4308 if (skb->seq==skb1->seq && skb->len>=skb1->len)
4309 {4310 skb_append(skb1,skb);
4311 skb_unlink(skb1);
4312 kfree_skb(skb1,FREE_READ);
4313 dup_dumped=1;
4314 skb1=NULL;
4315 break;
4316 }4317
4318 /*4319 * Found where it fits4320 */4321
4322 if (after(skb->seq+1, skb1->seq))
4323 {4324 skb_append(skb1,skb);
4325 break;
4326 }4327
4328 /*4329 * See if we've hit the start. If so insert.4330 */4331 if (skb1 == skb_peek(&sk->receive_queue))
4332 {4333 skb_queue_head(&sk->receive_queue, skb);
4334 break;
4335 }4336 }4337 }4338
4339 /*4340 * Figure out what the ack value for this frame is4341 */4342
4343 if (before(sk->acked_seq, sk->copied_seq))
4344 {4345 printk("*** tcp.c:tcp_data bug acked < copied\n");
4346 sk->acked_seq = sk->copied_seq;
4347 }4348
4349 /*4350 * Now figure out if we can ack anything. This is very messy because we really want two4351 * receive queues, a completed and an assembly queue. We also want only one transmit4352 * queue.4353 */4354
4355 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1))
4356 {4357 if (before(skb->seq, sk->acked_seq+1))
4358 {4359
4360 if (after(skb->end_seq, sk->acked_seq))
4361 sk->acked_seq = skb->end_seq;
4362
4363 skb->acked = 1;
4364
4365 /*4366 * When we ack the fin, we do the FIN 4367 * processing.4368 */4369
4370 if (skb->h.th->fin)
4371 {4372 tcp_fin(skb,sk,skb->h.th);
4373 }4374
4375 for(skb2 = skb->next;
4376 skb2 != (structsk_buff *)&sk->receive_queue;
4377 skb2 = skb2->next)
4378 {4379 if (before(skb2->seq, sk->acked_seq+1))
4380 {4381 if (after(skb2->end_seq, sk->acked_seq))
4382 sk->acked_seq = skb2->end_seq;
4383
4384 skb2->acked = 1;
4385 /*4386 * When we ack the fin, we do4387 * the fin handling.4388 */4389 if (skb2->h.th->fin)
4390 {4391 tcp_fin(skb,sk,skb->h.th);
4392 }4393
4394 /*4395 * Force an immediate ack.4396 */4397
4398 sk->ack_backlog = sk->max_ack_backlog;
4399 }4400 else4401 {4402 break;
4403 }4404 }4405
4406 /*4407 * This also takes care of updating the window.4408 * This if statement needs to be simplified.4409 *4410 * rules for delaying an ack:4411 * - delay time <= 0.5 HZ4412 * - we don't have a window update to send4413 * - must send at least every 2 full sized packets4414 */4415 if (!sk->delay_acks ||
4416 sk->ack_backlog >= sk->max_ack_backlog ||
4417 sk->bytes_rcv > sk->max_unacked || th->fin ||
4418 sk->ato > HZ/2 ||
4419 tcp_raise_window(sk)) {4420 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4421 }4422 else4423 {4424 sk->ack_backlog++;
4425
4426 if(sk->debug)
4427 printk("Ack queued.\n");
4428 reset_xmit_timer(sk, TIME_WRITE, sk->ato);
4429 }4430 }4431 }4432
4433 /*4434 * If we've missed a packet, send an ack.4435 * Also start a timer to send another.4436 */4437
4438 if (!skb->acked)
4439 {4440
4441 /*4442 * This is important. If we don't have much room left,4443 * we need to throw out a few packets so we have a good4444 * window. Note that mtu is used, not mss, because mss is really4445 * for the send side. He could be sending us stuff as large as mtu.4446 */4447
4448 while (sock_rspace(sk) < sk->mtu)
4449 {4450 skb1 = skb_peek(&sk->receive_queue);
4451 if (skb1 == NULL)
4452 {4453 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4454 break;
4455 }4456
4457 /*4458 * Don't throw out something that has been acked. 4459 */4460
4461 if (skb1->acked)
4462 {4463 break;
4464 }4465
4466 skb_unlink(skb1);
4467 kfree_skb(skb1, FREE_READ);
4468 }4469 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4470 sk->ack_backlog++;
4471 reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, 0.5 * HZ));
4472 }4473 else4474 {4475 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4476 }4477
4478 /*4479 * Now tell the user we may have some data. 4480 */4481
4482 if (!sk->dead)
4483 {4484 if(sk->debug)
4485 printk("Data wakeup.\n");
4486 sk->data_ready(sk,0);
4487 }4488 return(0);
4489 }4490
4491
4492 /*4493 * This routine is only called when we have urgent data4494 * signalled. Its the 'slow' part of tcp_urg. It could be4495 * moved inline now as tcp_urg is only called from one4496 * place. We handle URGent data wrong. We have to - as4497 * BSD still doesn't use the correction from RFC961.4498 */4499
4500 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4501 {4502 u32ptr = ntohs(th->urg_ptr);
4503
4504 if (ptr)
4505 ptr--;
4506 ptr += ntohl(th->seq);
4507
4508 /* ignore urgent data that we've already seen and read */4509 if (after(sk->copied_seq, ptr))
4510 return;
4511
4512 /* do we already have a newer (or duplicate) urgent pointer? */4513 if (sk->urg_data && !after(ptr, sk->urg_seq))
4514 return;
4515
4516 /* tell the world about our new urgent pointer */4517 if (sk->proc != 0) {4518 if (sk->proc > 0) {4519 kill_proc(sk->proc, SIGURG, 1);
4520 }else{4521 kill_pg(-sk->proc, SIGURG, 1);
4522 }4523 }4524 sk->urg_data = URG_NOTYET;
4525 sk->urg_seq = ptr;
4526 }4527
4528 /*4529 * This is the 'fast' part of urgent handling.4530 */4531
4532 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4533 unsignedlongsaddr, unsignedlonglen)
4534 {4535 u32ptr;
4536
4537 /*4538 * Check if we get a new urgent pointer - normally not 4539 */4540
4541 if (th->urg)
4542 tcp_check_urg(sk,th);
4543
4544 /*4545 * Do we wait for any urgent data? - normally not4546 */4547
4548 if (sk->urg_data != URG_NOTYET)
4549 return 0;
4550
4551 /*4552 * Is the urgent pointer pointing into this packet? 4553 */4554
4555 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
4556 if (ptr >= len)
4557 return 0;
4558
4559 /*4560 * Ok, got the correct packet, update info 4561 */4562
4563 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4564 if (!sk->dead)
4565 sk->data_ready(sk,0);
4566 return 0;
4567 }4568
/*
 *	This will accept the next outstanding connection.
 *
 *	'sk' must be a listening socket; returns the child socket for
 *	the next established connection, or NULL with sk->err set
 *	(EINVAL, EAGAIN or ERESTARTSYS).  Blocks unless O_NONBLOCK
 *	is given in 'flags'.
 */

static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 * We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */

	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race: interrupts stay off while we take the socket
	   and until we either have a connection or are committed to sleep. */
	cli();
	sk->inuse = 1;

	while ((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked)
		{
			/* Interrupted by a signal while waiting. */
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
	}
	sti();

	/*
	 * Now all we need to do is return skb->sk.
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4627
/*
 *	This will initiate an outgoing connection.
 *
 *	Validates the destination address, picks an initial sequence
 *	number, builds and transmits the SYN (with an MSS option) and
 *	moves the socket to SYN_SENT with the retransmit timer armed.
 *
 *	Returns 0 on success or a negative errno (-EISCONN, -EINVAL,
 *	-EAFNOSUPPORT, -ENETUNREACH, -ENOMEM).
 */

static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev = NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
		return(-EISCONN);

	/*
	 *	Don't allow a double connect.
	 */

	if (sk->daddr)
		return -EINVAL;

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */

	if (usin->sin_addr.s_addr == INADDR_ANY)
		usin->sin_addr.s_addr = ip_my_addr();

	/*
	 *	Don't want a TCP connection going to a broadcast address
	 */

	if ((atype = ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype == IS_MULTICAST)
		return -ENETUNREACH;

	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq - 1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	/* Drop the lock across the sleeping allocation below. */
	release_sock(sk);

	buff = sock_wmalloc(sk, MAX_SYN_SIZE, 0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;


	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE, sk->ip_tos, sk->ip_ttl, &sk->ip_route_cache);
	if (tmp < 0)
	{
		sock_wfree(sk, buff);
		release_sock(sk);
		return(-ENETUNREACH);
	}
	if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
		sk->saddr = rt->rt_src;
	sk->rcv_saddr = sk->saddr;

	t1 = (struct tcphdr *) skb_put(buff, sizeof(struct tcphdr));

	memcpy(t1, (void *)&(sk->dummy_th), sizeof(*t1));
	buff->seq = sk->write_seq++;
	t1->seq = htonl(buff->seq);
	sk->sent_seq = sk->write_seq;	/* SYN consumes one sequence number */
	buff->end_seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;	/* 24-byte header: base 20 + 4-byte MSS option */
	/* use 512 or whatever user asked for */

	if (rt != NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp = rt->rt_window;
	else
		sk->window_clamp = 0;

	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if (rt)
		sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	else
		sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

	/*
	 *	but not bigger than device MTU
	 */

	if (sk->mtu < 32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

#ifdef CONFIG_SKIP

	/*
	 *	SKIP devices set their MTU to 65535. This is so they can take packets
	 *	unfragmented to security process then fragment. They could lie to the
	 *	TCP layer about a suitable MTU, but its easier to let skip sort it out
	 *	simply because the final package we want unfragmented is going to be
	 *
	 *	[IPHDR][IPSP][Security data][Modified TCP data][Security data]
	 */

	if (skip_pick_mtu != NULL)	/* If SKIP is loaded.. */
		sk->mtu = skip_pick_mtu(sk->mtu, dev);
#endif

	/*
	 *	Put in the TCP options to say MTU.
	 */

	ptr = skb_put(buff, 4);
	ptr[0] = 2;	/* option kind: MSS */
	ptr[1] = 4;	/* option length */
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 *	This must go first otherwise a really quick response will get reset.
	 */

	tcp_cache_zap();
	tcp_set_state(sk, TCP_SYN_SENT);
	if (rt && rt->rt_flags&RTF_IRTT)
		sk->rto = rt->rt_irtt;
	else
		sk->rto = TCP_TIMEOUT_INIT;
	sk->retransmit_timer.function = &retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = 0;	/* Now works the right way instead of a hacked
				   initial setting */

	sk->prot->queue_xmit(sk, dev, buff, 0);
	/* NOTE(review): this second reset_xmit_timer with identical
	   arguments looks redundant given the one above - confirm
	   whether queue_xmit can disturb the timer before removing. */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4800 /*4801 * React to a out-of-window TCP sequence number in an incoming packet4802 */4803 staticvoidbad_tcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4804 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4805 {4806 if (th->rst)
4807 return;
4808
4809 /*4810 * Send a reset if we get something not ours and we are4811 * unsynchronized. Note: We don't do anything to our end. We4812 * are just killing the bogus remote connection then we will4813 * connect again and it will work (with luck).4814 */4815
4816 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4817 {4818 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4819 return;
4820 }4821
4822 /* Try to resync things. */4823 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4824 return;
4825 }4826
4827 /*4828 * This functions checks to see if the tcp header is actually acceptable. 4829 */4830
4831 extern__inline__inttcp_sequence(structsock *sk, u32seq, u32end_seq)
/* */4832 {4833 /* does the packet contain any unseen data AND */4834 /* does the packet start before the window? */4835 returnafter(end_seq+1, sk->acked_seq) &&
4836 before(seq, sk->acked_seq + sk->window + 1);
4837 }4838
/*
 *	Standard processing for a received RST: mark the socket dead,
 *	pick the errno the user will see, close down and free the frame.
 */
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Map the connection state at reset time onto a user-visible error. */
	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]: a TIME_WAIT
	 *	socket ignores the reset and keeps counting down.
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk, TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk, TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);

	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return 0;
}
4871 /*4872 * Find the socket, using the last hit cache if applicable.4873 */4874 staticinlinestructsock * get_tcp_sock(u32saddr, u16sport, u32daddr, u16dport)
/* */4875 {4876 structsock * sk;
4877
4878 sk = (structsock *) th_cache_sk;
4879 if (saddr != th_cache_saddr || daddr != th_cache_daddr ||
4880 sport != th_cache_sport || dport != th_cache_dport) {4881 sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
4882 if (sk) {4883 th_cache_saddr=saddr;
4884 th_cache_daddr=daddr;
4885 th_cache_dport=dport;
4886 th_cache_sport=sport;
4887 th_cache_sk=sk;
4888 }4889 }4890 returnsk;
4891 }4892
4893
/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 *
 *	Entry point from the IP layer (and re-entered from the socket
 *	backlog).  "redo" is 1 if we have already seen this skb but couldn't
 *	use it at that time (the socket was locked). In that case
 *	we have already done a lot of the work (looked up the socket
 *	etc).  Always returns 0; the frame is consumed (freed or queued)
 *	on every path.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok = 0;		/* set once a SYN has been validated for this segment */

	th = skb->h.th;
	sk = skb->sk;
	if (!redo) {
		tcp_statistics.TcpInSegs++;
		/* Segments not addressed to this host (promiscuous capture etc.) are dropped. */
		if (skb->pkt_type != PACKET_HOST)
		{
			kfree_skb(skb, FREE_READ);
			return(0);
		}
		/*
		 *	Pull up the IP header.
		 */
		skb_pull(skb, skb->h.raw - skb->data);
		/*
		 *	Try to use the device checksum if provided.
		 *	(skip the software checksum if CHECKSUM_UNNECESSARY)
		 */
		if (
			((skb->ip_summed == CHECKSUM_HW) && tcp_check(th, len, saddr, daddr, skb->csum)) ||
			((skb->ip_summed == CHECKSUM_NONE) && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
		   )
		{
			/* Bad checksum: silently drop the frame. */
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}
		sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
		if (!sk)
			goto no_tcp_socket;
		skb->sk = sk;
		skb->seq = ntohl(th->seq);
		/* SYN and FIN each occupy one unit of sequence space. */
		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff * 4;
		skb->ack_seq = ntohl(th->ack_seq);

		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		/* Addresses stored from the local point of view for reply building
		   — NOTE(review): saddr/daddr deliberately swapped; confirm against users. */
		skb->saddr = daddr;
		skb->daddr = saddr;

		/*
		 *	We may need to add it to the backlog here.
		 *	Interrupts are disabled while we test and set the socket
		 *	lock flag so the check-and-queue is atomic.
		 */
		cli();
		if (sk->inuse)
		{
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk->zapped || sk->state == TCP_CLOSE)
		goto no_tcp_socket;

	/* Should never happen: a live socket always has a protocol. */
	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 *	(Undone by kfree_skb(..., FREE_READ) on the drop paths below.)
	 */

	skb->sk = sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if (sk->state != TCP_ESTABLISHED)	/* Skip this lot for normal flow */
	{
		/*
		 *	Now deal with unusual cases.
		 */

		if (sk->state == TCP_LISTEN)
		{
			if (th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it
			 */

			if (th->rst || !th->syn || th->ack || ip_chk_addr(daddr) != IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* retransmitted SYN? The peer's original SYN was already ACKed. */
		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq + 1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected
		 */

		if (sk->state == TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if (th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if (!tcp_ack(sk, th, saddr, len))
				{
					/* Reset the ack - its an ack from a
					   different connection  [ th->rst is checked in tcp_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if (th->rst)
					return tcp_std_reset(sk, skb);
				if (!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok = 1;	/* Don't reset this connection for the syn */
				sk->acked_seq = skb->seq + 1;
				sk->lastwin_seq = skb->seq + 1;
				sk->fin_seq = skb->seq;
				tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk, th);
				sk->dummy_th.dest = th->source;
				sk->copied_seq = sk->acked_seq;
				if (!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				/* Peer advertised no window: fall back to a tiny default. */
				if (sk->max_window == 0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if (th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if (sk->saddr == saddr && sk->daddr == daddr &&
						sk->dummy_th.source == th->source &&
						sk->dummy_th.dest == th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk, skb);
					}
					tcp_set_state(sk, TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
	 *	a more complex suggestion for fixing these reuse issues in RFC1644
	 *	but not yet ready for general use. Also see RFC1379.
	 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(skb->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if (sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			/* Uncharge the old socket before re-homing the skb. */
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err = ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			/* Hand the SYN to a listener on the same port, if any. */
			sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state == TCP_LISTEN)
			{
				sk->inuse = 1;
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				/* New ISN is biased past the dead connection's write_seq. */
				tcp_conn_request(sk, skb, daddr, saddr, opt, dev, seq + 128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */

	if (!tcp_sequence(sk, skb->seq, skb->end_seq))
	{
		bad_tcp_sequence(sk, th, len, opt, saddr, dev);
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if (th->rst)
		return tcp_std_reset(sk, skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if (th->syn && !syn_ok)
	{
		tcp_reset(daddr, saddr, th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk, skb);
	}


	/*
	 *	Delayed ACK time estimator: track inter-arrival time of
	 *	segments to pick an ACK delay (ato), seeded at HZ/3.
	 */

	if (sk->lrcvtime == 0)
	{
		sk->lrcvtime = jiffies;
		sk->ato = HZ / 3;
	}
	else
	{
		int m;

		m = jiffies - sk->lrcvtime;

		sk->lrcvtime = jiffies;

		if (m <= 0)
			m = 1;

		/* Never delay the ACK longer than rtt/8. */
		if (m > (sk->rtt >> 3))
		{
			sk->ato = sk->rtt >> 3;
			/*
			 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
			 */
		}
		else
		{
			/* Smooth ato towards the observed inter-arrival time. */
			sk->ato = (sk->ato >> 1) + m;
			/*
			 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
			 */
		}
	}

	/*
	 *	Process the ACK
	 */


	if (th->ack && !tcp_ack(sk, th, saddr, len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if (sk->state == TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	If the accepted buffer put us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */

	if (sk->rmem_alloc >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}


	/*
	 *	Process urgent data
	 */

	if (tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	Process the encapsulated data
	 */

	if (tcp_data(skb, sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */

	release_sock(sk);
	return 0;

no_tcp_socket:
	/*
	 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
	 */
	tcp_reset(daddr, saddr, th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
	skb->sk = NULL;
	/*
	 *	Discard frame
	 */
	kfree_skb(skb, FREE_READ);
	return 0;
}
/*
 *	This routine sends a packet with an out of date sequence
 *	number. It assumes the other end will try to ack it.
 *
 *	Used as a zero-window probe: if unsent data fits in the window
 *	we clone and send the head of the write queue (sender-side SWS
 *	avoidance case); otherwise we send a bare ACK with seq-1 to
 *	provoke an ACK from the peer.
 */

static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff, *skb;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 *	Write data can still be transmitted/retransmitted in the
	 *	following states.  If any other state is encountered, return.
	 *	[listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED &&
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 &&
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	)
	{
		return;
	}
	if ( before(sk->sent_seq, sk->window_seq) &&
	    (skb = skb_peek(&sk->write_queue)))
	{
		/*
		 *	We are probing the opening of a window
		 *	but the window size is != 0
		 *	must have been a result SWS avoidance ( sender )
		 */

		struct iphdr *iph;
		struct tcphdr *th;
		struct tcphdr *nth;
		unsigned long win_size;
#if 0
		unsigned long ow_size;
#endif
		void * tcp_data_start;

		/*
		 *	How many bytes can we send ?
		 */

		win_size = sk->window_seq - sk->sent_seq;

		/*
		 *	Recover the buffer pointers from the queued frame
		 */

		iph = (struct iphdr *)skb->ip_hdr;
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));

		/*
		 *	Grab the data for a temporary frame
		 *	(GFP_ATOMIC: this runs from timer context)
		 */

		buff = sock_wmalloc(sk, win_size + th->doff * 4 +
				     (iph->ihl << 2) +
				     sk->prot->max_header + 15,
				     1, GFP_ATOMIC);
		if ( buff == NULL )
			return;

		/*
		 *	If we strip the packet on the write queue we must
		 *	be ready to retransmit this one
		 */

		buff->free = /*0*/1;

		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put headers on the new packet
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
					 IPPROTO_TCP, sk->opt, buff->truesize,
					 sk->ip_tos, sk->ip_ttl, &sk->ip_route_cache);
		if (tmp < 0)
		{
			sock_wfree(sk, buff);
			return;
		}

		/*
		 *	Move the TCP header over
		 */

		buff->dev = dev;

		nth = (struct tcphdr *) skb_put(buff, th->doff * 4);

		memcpy(nth, th, th->doff * 4);

		/*
		 *	Correct the new header
		 */

		nth->ack = 1;
		nth->ack_seq = htonl(sk->acked_seq);
		nth->window = htons(tcp_select_window(sk));
		nth->check = 0;	/* recomputed below by tcp_send_check() */

		/*
		 *	Find the first data byte.
		 */

		tcp_data_start = (char *) th + (th->doff << 2);

		/*
		 *	Add it to our new buffer
		 */

		memcpy(skb_put(buff, win_size), tcp_data_start, win_size);

		/*
		 *	Remember our right edge sequence number.
		 */

		buff->end_seq = sk->sent_seq + win_size;
		sk->sent_seq = buff->end_seq;		/* Hack */
		/* Urgent pointer beyond what we send cannot be expressed: clear URG. */
		if (th->urg && ntohs(th->urg_ptr) < win_size)
			nth->urg = 0;

		/*
		 *	Checksum the split buffer
		 */

		tcp_send_check(nth, sk->saddr, sk->daddr,
			   nth->doff * 4 + win_size, sk);
	}
	else
	{
		/* Nothing sendable in window: build a bare keep-probe ACK. */
		buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
		if (buff == NULL)
			return;

		buff->free = 1;
		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put in the IP header and routing stuff.
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE, sk->ip_tos, sk->ip_ttl, &sk->ip_route_cache);
		if (tmp < 0)
		{
			sock_wfree(sk, buff);
			return;
		}

		t1 = (struct tcphdr *)skb_put(buff, sizeof(struct tcphdr));
		memcpy(t1, (void *) &sk->dummy_th, sizeof(*t1));

		/*
		 *	Use a previous sequence.
		 *	This should cause the other end to send an ack.
		 */

		t1->seq = htonl(sk->sent_seq - 1);
		t1->ack = 1;
		t1->res1 = 0;
		t1->res2 = 0;
		t1->rst = 0;
		t1->urg = 0;
		t1->psh = 0;
		t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
		t1->syn = 0;
		t1->ack_seq = htonl(sk->acked_seq);
		t1->window = htons(tcp_select_window(sk));
		t1->doff = sizeof(*t1) / 4;
		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	}

	/*
	 *	Send it.
	 */

	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
5521 /*5522 * A window probe timeout has occurred.5523 */5524
5525 voidtcp_send_probe0(structsock *sk)
/* */5526 {5527 if (sk->zapped)
5528 return; /* After a valid reset we can send no more */5529
5530 tcp_write_wakeup(sk);
5531
5532 sk->backoff++;
5533 sk->rto = min(sk->rto << 1, 120*HZ);
5534 sk->retransmits++;
5535 sk->prot->retransmits ++;
5536 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5537 }5538
5539 /*5540 * Socket option code for TCP. 5541 */5542
5543 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5544 {5545 intval,err;
5546
5547 if(level!=SOL_TCP)
5548 returnip_setsockopt(sk,level,optname,optval,optlen);
5549
5550 if (optval == NULL)
5551 return(-EINVAL);
5552
5553 err=verify_area(VERIFY_READ, optval, sizeof(int));
5554 if(err)
5555 returnerr;
5556
5557 val = get_user((int *)optval);
5558
5559 switch(optname)
5560 {5561 caseTCP_MAXSEG:
5562 /*5563 * values greater than interface MTU won't take effect. however at5564 * the point when this call is done we typically don't yet know5565 * which interface is going to be used5566 */5567 if(val<1||val>MAX_WINDOW)
5568 return -EINVAL;
5569 sk->user_mss=val;
5570 return 0;
5571 caseTCP_NODELAY:
5572 sk->nonagle=(val==0)?0:1;
5573 return 0;
5574 default:
5575 return(-ENOPROTOOPT);
5576 }5577 }5578
5579 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5580 {5581 intval,err;
5582
5583 if(level!=SOL_TCP)
5584 returnip_getsockopt(sk,level,optname,optval,optlen);
5585
5586 switch(optname)
5587 {5588 caseTCP_MAXSEG:
5589 val=sk->user_mss;
5590 break;
5591 caseTCP_NODELAY:
5592 val=sk->nonagle;
5593 break;
5594 default:
5595 return(-ENOPROTOOPT);
5596 }5597 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5598 if(err)
5599 returnerr;
5600 put_user(sizeof(int),(int *) optlen);
5601
5602 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5603 if(err)
5604 returnerr;
5605 put_user(val,(int *)optval);
5606
5607 return(0);
5608 }5609
5610
/*
 *	The TCP protocol operations table, wired into the generic INET
 *	socket layer.  Entries are positional initialisers for struct proto;
 *	the per-entry labels below are inferred from the handlers bound
 *	here — NOTE(review): confirm against the struct proto declaration.
 */
struct proto tcp_prot = {
	tcp_close,		/* close */
	ip_build_header,	/* build_header */
	tcp_connect,		/* connect */
	tcp_accept,		/* accept */
	ip_queue_xmit,		/* queue_xmit */
	tcp_retransmit,		/* retransmit */
	tcp_write_wakeup,	/* write_wakeup */
	tcp_read_wakeup,	/* read_wakeup */
	tcp_rcv,		/* rcv */
	tcp_select,		/* select */
	tcp_ioctl,		/* ioctl */
	NULL,			/* init — none needed for TCP */
	tcp_shutdown,		/* shutdown */
	tcp_setsockopt,		/* setsockopt */
	tcp_getsockopt,		/* getsockopt */
	tcp_sendmsg,		/* sendmsg */
	tcp_recvmsg,		/* recvmsg */
	NULL,			/* No special bind() */
	128,			/* NOTE(review): positional — presumably max_header; verify */
	0,
	"TCP",			/* protocol name */
	0, 0,
	{NULL,}
};