1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while 26 * sk->inuse=1 and was trying to connect 27 * (tcp_err()). 28 * Alan Cox : All icmp error handling was broken 29 * pointers passed where wrong and the 30 * socket was looked up backwards. Nobody 31 * tested any icmp error code obviously. 32 * Alan Cox : tcp_err() now handled properly. It 33 * wakes people on errors. select 34 * behaves and the icmp error race 35 * has gone by moving it into sock.c 36 * Alan Cox : tcp_reset() fixed to work for 37 * everything not just packets for 38 * unknown sockets. 39 * Alan Cox : tcp option processing. 40 * Alan Cox : Reset tweaked (still not 100%) [Had 41 * syn rule wrong] 42 * Herp Rosmanith : More reset fixes 43 * Alan Cox : No longer acks invalid rst frames. 44 * Acking any kind of RST is right out. 45 * Alan Cox : Sets an ignore me flag on an rst 46 * receive otherwise odd bits of prattle 47 * escape still 48 * Alan Cox : Fixed another acking RST frame bug. 49 * Should stop LAN workplace lockups. 
50 * Alan Cox : Some tidyups using the new skb list 51 * facilities 52 * Alan Cox : sk->keepopen now seems to work 53 * Alan Cox : Pulls options out correctly on accepts 54 * Alan Cox : Fixed assorted sk->rqueue->next errors 55 * Alan Cox : PSH doesn't end a TCP read. Switched a 56 * bit to skb ops. 57 * Alan Cox : Tidied tcp_data to avoid a potential 58 * nasty. 59 * Alan Cox : Added some better commenting, as the 60 * tcp is hard to follow 61 * Alan Cox : Removed incorrect check for 20 * psh 62 * Michael O'Reilly : ack < copied bug fix. 63 * Johannes Stille : Misc tcp fixes (not all in yet). 64 * Alan Cox : FIN with no memory -> CRASH 65 * Alan Cox : Added socket option proto entries. 66 * Also added awareness of them to accept. 67 * Alan Cox : Added TCP options (SOL_TCP) 68 * Alan Cox : Switched wakeup calls to callbacks, 69 * so the kernel can layer network 70 * sockets. 71 * Alan Cox : Use ip_tos/ip_ttl settings. 72 * Alan Cox : Handle FIN (more) properly (we hope). 73 * Alan Cox : RST frames sent on unsynchronised 74 * state ack error. 75 * Alan Cox : Put in missing check for SYN bit. 76 * Alan Cox : Added tcp_select_window() aka NET2E 77 * window non shrink trick. 78 * Alan Cox : Added a couple of small NET2E timer 79 * fixes 80 * Charles Hedrick : TCP fixes 81 * Toomas Tamm : TCP window fixes 82 * Alan Cox : Small URG fix to rlogin ^C ack fight 83 * Charles Hedrick : Rewrote most of it to actually work 84 * Linus : Rewrote tcp_read() and URG handling 85 * completely 86 * Gerhard Koerting: Fixed some missing timer handling 87 * Matthew Dillon : Reworked TCP machine states as per RFC 88 * Gerhard Koerting: PC/TCP workarounds 89 * Adam Caldwell : Assorted timer/timing errors 90 * Matthew Dillon : Fixed another RST bug 91 * Alan Cox : Move to kernel side addressing changes. 92 * Alan Cox : Beginning work on TCP fastpathing 93 * (not yet usable) 94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
95 * Alan Cox : TCP fast path debugging 96 * Alan Cox : Window clamping 97 * Michael Riepe : Bug in tcp_check() 98 * Matt Dillon : More TCP improvements and RST bug fixes 99 * Matt Dillon : Yet more small nasties remove from the 100 * TCP code (Be very nice to this man if 101 * tcp finally works 100%) 8) 102 * Alan Cox : BSD accept semantics. 103 * Alan Cox : Reset on closedown bug. 104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 105 * Michael Pall : Handle select() after URG properly in 106 * all cases. 107 * Michael Pall : Undo the last fix in tcp_read_urg() 108 * (multi URG PUSH broke rlogin). 109 * Michael Pall : Fix the multi URG PUSH problem in 110 * tcp_readable(), select() after URG 111 * works now. 112 * Michael Pall : recv(...,MSG_OOB) never blocks in the 113 * BSD api. 114 * Alan Cox : Changed the semantics of sk->socket to 115 * fix a race and a signal problem with 116 * accept() and async I/O. 117 * Alan Cox : Relaxed the rules on tcp_sendto(). 118 * Yury Shevchuk : Really fixed accept() blocking problem. 119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 120 * clients/servers which listen in on 121 * fixed ports. 122 * Alan Cox : Cleaned the above up and shrank it to 123 * a sensible code size. 124 * Alan Cox : Self connect lockup fix. 125 * Alan Cox : No connect to multicast. 126 * Ross Biro : Close unaccepted children on master 127 * socket close. 128 * Alan Cox : Reset tracing code. 129 * Alan Cox : Spurious resets on shutdown. 130 * Alan Cox : Giant 15 minute/60 second timer error 131 * Alan Cox : Small whoops in selecting before an 132 * accept. 133 * Alan Cox : Kept the state trace facility since 134 * it's handy for debugging. 135 * Alan Cox : More reset handler fixes. 
136 * Alan Cox : Started rewriting the code based on 137 * the RFC's for other useful protocol 138 * references see: Comer, KA9Q NOS, and 139 * for a reference on the difference 140 * between specifications and how BSD 141 * works see the 4.4lite source. 142 * A.N.Kuznetsov : Don't time wait on completion of tidy 143 * close. 144 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 145 * Linus Torvalds : Fixed BSD port reuse to work first syn 146 * Alan Cox : Reimplemented timers as per the RFC 147 * and using multiple timers for sanity. 148 * Alan Cox : Small bug fixes, and a lot of new 149 * comments. 150 * Alan Cox : Fixed dual reader crash by locking 151 * the buffers (much like datagram.c) 152 * Alan Cox : Fixed stuck sockets in probe. A probe 153 * now gets fed up of retrying without 154 * (even a no space) answer. 155 * Alan Cox : Extracted closing code better 156 * Alan Cox : Fixed the closing state machine to 157 * resemble the RFC. 158 * Alan Cox : More 'per spec' fixes. 159 * Jorge Cwik : Even faster checksumming. 160 * Alan Cox : tcp_data() doesn't ack illegal PSH 161 * only frames. At least one pc tcp stack 162 * generates them. 163 * Alan Cox : Cache last socket. 164 * Alan Cox : Per route irtt. 165 * Matt Day : Select() match BSD precisely on error 166 * Alan Cox : New buffers 167 * Marc Tamsky : Various sk->prot->retransmits and 168 * sk->retransmits misupdating fixed. 169 * Fixed tcp_write_timeout: stuck close, 170 * and TCP syn retries gets used now. 171 * Mark Yarvis : In tcp_read_wakeup(), don't send an 172 * ack if stat is TCP_CLOSED. 173 * Alan Cox : Look up device on a retransmit - routes may 174 * change. Doesn't yet cope with MSS shrink right 175 * but its a start! 176 * Marc Tamsky : Closing in closing fixes. 177 * Mike Shaver : RFC1122 verifications. 178 * Alan Cox : rcv_saddr errors. 179 * Alan Cox : Block double connect(). 180 * Alan Cox : Small hooks for enSKIP. 181 * Alexey Kuznetsov: Path MTU discovery. 
182 * Alan Cox : Support soft errors. 183 * Alan Cox : Fix MTU discovery pathalogical case 184 * when the remote claims no mtu! 185 * Marc Tamsky : TCP_CLOSE fix. 186 * 187 * To Fix: 188 * Fast path the code. Two things here - fix the window calculation 189 * so it doesn't iterate over the queue, also spot packets with no funny 190 * options arriving in order and process directly. 191 * 192 * Rewrite output state machine to use a single queue and do low window 193 * situations as per the spec (RFC 1122) 194 * Speed up input assembly algorithm. 195 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 196 * could do with it working on IPv4 197 * User settable/learned rtt/max window/mtu 198 * Fix the window handling to use PR's new code. 199 * 200 * Change the fundamental structure to a single send queue maintained 201 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 202 * active routes too]). Cut the queue off in tcp_retransmit/ 203 * tcp_transmit. 204 * Change the receive queue to assemble as it goes. This lets us 205 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 206 * tcp_data/tcp_read as well as the window shrink crud. 207 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 208 * tcp_queue_skb seem obvious routines to extract. 209 * 210 * This program is free software; you can redistribute it and/or 211 * modify it under the terms of the GNU General Public License 212 * as published by the Free Software Foundation; either version 213 * 2 of the License, or(at your option) any later version. 214 * 215 * Description of States: 216 * 217 * TCP_SYN_SENT sent a connection request, waiting for ack 218 * 219 * TCP_SYN_RECV received a connection request, sent ack, 220 * waiting for final ack in three-way handshake. 
221 * 222 * TCP_ESTABLISHED connection established 223 * 224 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 225 * transmission of remaining buffered data 226 * 227 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 228 * to shutdown 229 * 230 * TCP_CLOSING both sides have shutdown but we still have 231 * data we have to finish sending 232 * 233 * TCP_TIME_WAIT timeout to catch resent junk before entering 234 * closed, can only be entered from FIN_WAIT2 235 * or CLOSING. Required because the other end 236 * may not have gotten our last ACK causing it 237 * to retransmit the data packet (which we ignore) 238 * 239 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 240 * us to finish writing our data and to shutdown 241 * (we have to close() to move on to LAST_ACK) 242 * 243 * TCP_LAST_ACK our side has shutdown after remote has 244 * shutdown. There may still be data in our 245 * buffer that we have to finish sending 246 * 247 * TCP_CLOSE socket is finished 248 */ 249
250 /* 251 * RFC1122 status: 252 * NOTE: I'm not going to be doing comments in the code for this one except 253 * for violations and the like. tcp.c is just too big... If I say something 254 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out 255 * with Alan. -- MS 950903 256 * 257 * Use of PSH (4.2.2.2) 258 * MAY aggregate data sent without the PSH flag. (does) 259 * MAY queue data received without the PSH flag. (does) 260 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't) 261 * MAY implement PSH on send calls. (doesn't, thus:) 262 * MUST NOT buffer data indefinitely (doesn't [1 second]) 263 * MUST set PSH on last segment (does) 264 * MAY pass received PSH to application layer (doesn't) 265 * SHOULD send maximum-sized segment whenever possible. (almost always does) 266 * 267 * Window Size (4.2.2.3, 4.2.2.16) 268 * MUST treat window size as an unsigned number (does) 269 * SHOULD treat window size as a 32-bit number (does not) 270 * MUST NOT shrink window once it is offered (does not normally) 271 * 272 * Urgent Pointer (4.2.2.4) 273 * **MUST point urgent pointer to last byte of urgent data (not right 274 * after). (doesn't, to be like BSD) 275 * MUST inform application layer asynchronously of incoming urgent 276 * data. (does) 277 * MUST provide application with means of determining the amount of 278 * urgent data pending. (does) 279 * **MUST support urgent data sequence of arbitrary length. (doesn't, but 280 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) 281 * [Follows BSD 1 byte of urgent data] 282 * 283 * TCP Options (4.2.2.5) 284 * MUST be able to receive TCP options in any segment. (does) 285 * MUST ignore unsupported options (does) 286 * 287 * Maximum Segment Size Option (4.2.2.6) 288 * MUST implement both sending and receiving MSS. (does) 289 * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send 290 * it always). 
(does, even when MSS == 536, which is legal) 291 * MUST assume MSS == 536 if no MSS received at connection setup (does) 292 * MUST calculate "effective send MSS" correctly: 293 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) 294 * (does - but allows operator override) 295 * 296 * TCP Checksum (4.2.2.7) 297 * MUST generate and check TCP checksum. (does) 298 * 299 * Initial Sequence Number Selection (4.2.2.8) 300 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's 301 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is 302 * necessary for 10Mbps networks - and harder than BSD to spoof!) 303 * 304 * Simultaneous Open Attempts (4.2.2.10) 305 * MUST support simultaneous open attempts (does) 306 * 307 * Recovery from Old Duplicate SYN (4.2.2.11) 308 * MUST keep track of active vs. passive open (does) 309 * 310 * RST segment (4.2.2.12) 311 * SHOULD allow an RST segment to contain data (does, but doesn't do 312 * anything with it, which is standard) 313 * 314 * Closing a Connection (4.2.2.13) 315 * MUST inform application of whether connectin was closed by RST or 316 * normal close. (does) 317 * MAY allow "half-duplex" close (treat connection as closed for the 318 * local app, even before handshake is done). (does) 319 * MUST linger in TIME_WAIT for 2 * MSL (does) 320 * 321 * Retransmission Timeout (4.2.2.15) 322 * MUST implement Jacobson's slow start and congestion avoidance 323 * stuff. (does) 324 * 325 * Probing Zero Windows (4.2.2.17) 326 * MUST support probing of zero windows. (does) 327 * MAY keep offered window closed indefinitely. (does) 328 * MUST allow remote window to stay closed indefinitely. (does) 329 * 330 * Passive Open Calls (4.2.2.18) 331 * MUST NOT let new passive open affect other connections. (doesn't) 332 * MUST support passive opens (LISTENs) concurrently. (does) 333 * 334 * Time to Live (4.2.2.19) 335 * MUST make TCP TTL configurable. 
(does - IP_TTL option) 336 * 337 * Event Processing (4.2.2.20) 338 * SHOULD queue out-of-order segments. (does) 339 * MUST aggregate ACK segments whenever possible. (does but badly) 340 * 341 * Retransmission Timeout Calculation (4.2.3.1) 342 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO 343 * calculation. (does, or at least explains them in the comments 8*b) 344 * SHOULD initialize RTO to 0 and RTT to 3. (does) 345 * 346 * When to Send an ACK Segment (4.2.3.2) 347 * SHOULD implement delayed ACK. (does not) 348 * MUST keep ACK delay < 0.5 sec. (N/A) 349 * 350 * When to Send a Window Update (4.2.3.3) 351 * MUST implement receiver-side SWS. (does) 352 * 353 * When to Send Data (4.2.3.4) 354 * MUST implement sender-side SWS. (does) 355 * SHOULD implement Nagle algorithm. (does) 356 * 357 * TCP Connection Failures (4.2.3.5) 358 * MUST handle excessive retransmissions "properly" (see the RFC). (does) 359 * SHOULD inform application layer of soft errors. (does) 360 * 361 * TCP Keep-Alives (4.2.3.6) 362 * MAY provide keep-alives. (does) 363 * MUST make keep-alives configurable on a per-connection basis. (does) 364 * MUST default to no keep-alives. (does) 365 * **MUST make keep-alive interval configurable. (doesn't) 366 * **MUST make default keep-alive interval > 2 hours. (doesn't) 367 * MUST NOT interpret failure to ACK keep-alive packet as dead 368 * connection. (doesn't) 369 * SHOULD send keep-alive with no data. (does) 370 * 371 * TCP Multihoming (4.2.3.7) 372 * MUST get source address from IP layer before sending first 373 * SYN. (does) 374 * MUST use same local address for all segments of a connection. (does) 375 * 376 * IP Options (4.2.3.8) 377 * MUST ignore unsupported IP options. (does) 378 * MAY support Time Stamp and Record Route. (does) 379 * MUST allow application to specify a source route. (does) 380 * MUST allow receieved Source Route option to set route for all future 381 * segments on this connection. 
(does not (security issues)) 382 * 383 * ICMP messages (4.2.3.9) 384 * MUST act on ICMP errors. (does) 385 * MUST slow transmission upon receipt of a Source Quench. (does) 386 * MUST NOT abort connection upon receipt of soft Destination 387 * Unreachables (0, 1, 5), Time Exceededs and Parameter 388 * Problems. (doesn't) 389 * SHOULD report soft Destination Unreachables etc. to the 390 * application. (does) 391 * SHOULD abort connection upon receipt of hard Destination Unreachable 392 * messages (2, 3, 4). (does) 393 * 394 * Remote Address Validation (4.2.3.10) 395 * MUST reject as an error OPEN for invalid remote IP address. (does) 396 * MUST ignore SYN with invalid source address. (does) 397 * MUST silently discard incoming SYN for broadcast/multicast 398 * address. (does) 399 * 400 * Asynchronous Reports (4.2.4.1) 401 * **MUST provide mechanism for reporting soft errors to application 402 * layer. (doesn't) 403 * 404 * Type of Service (4.2.4.2) 405 * MUST allow application layer to set Type of Service. (does IP_TOS) 406 * 407 * (Whew. -- MS 950903) 408 **/ 409
410 #include <linux/types.h>
411 #include <linux/sched.h>
412 #include <linux/mm.h>
413 #include <linux/time.h>
414 #include <linux/string.h>
415 #include <linux/config.h>
416 #include <linux/socket.h>
417 #include <linux/sockios.h>
418 #include <linux/termios.h>
419 #include <linux/in.h>
420 #include <linux/fcntl.h>
421 #include <linux/inet.h>
422 #include <linux/netdevice.h>
423 #include <net/snmp.h>
424 #include <net/ip.h>
425 #include <net/protocol.h>
426 #include <net/icmp.h>
427 #include <net/tcp.h>
428 #include <net/arp.h>
429 #include <linux/skbuff.h>
430 #include <net/sock.h>
431 #include <net/route.h>
432 #include <linux/errno.h>
433 #include <linux/timer.h>
434 #include <asm/system.h>
435 #include <asm/segment.h>
436 #include <linux/mm.h>
437 #include <net/checksum.h>
438
439 /* 440 * The MSL timer is the 'normal' timer. 441 */ 442
443 #definereset_msl_timer(x,y,z) reset_timer(x,y,z)
444
445 #define SEQ_TICK 3
446 unsignedlongseq_offset;
447 structtcp_mibtcp_statistics;
448
449 /* 450 * Cached last hit socket 451 */ 452
453 volatileunsignedlongth_cache_saddr,th_cache_daddr;
454 volatileunsignedshortth_cache_dport, th_cache_sport;
455 volatilestructsock *th_cache_sk;
456
457 voidtcp_cache_zap(void)
/* */ 458 { 459 unsignedlongflags;
460 save_flags(flags);
461 cli();
462 th_cache_saddr=0;
463 th_cache_daddr=0;
464 th_cache_dport=0;
465 th_cache_sport=0;
466 th_cache_sk=NULL;
467 restore_flags(flags);
468 } 469
/*
 *	Forward declaration: tcp_close() is used by tcp_close_pending()
 *	before its definition appears.
 */
static void tcp_close(struct sock *sk, int timeout);


/*
 *	The less said about this the better, but it works and will do for 1.2.
 *	This wait queue is woken from tcp_set_state() whenever a socket moves
 *	from TCP_SYN_RECV to TCP_ESTABLISHED.
 */

static struct wait_queue *master_select_wakeup;
478
/*
 *	Return the smaller of two unsigned values. Note that the result is
 *	handed back as a (signed) int.
 */
static __inline__ int min(unsigned int a, unsigned int b)
{
	return (a < b) ? a : b;
}

/*
 *	State tracing is compiled out by default. Remove the #undef (or
 *	define STATE_TRACE) to get a printk on every state change of a
 *	socket that has sk->debug set.
 */
#undef STATE_TRACE

#ifdef STATE_TRACE
/* Printable names, indexed by TCP state number. */
static char *statename[]={
	"Unused",
	"Established",
	"Syn Sent",
	"Syn Recv",
	"Fin Wait 1",
	"Fin Wait 2",
	"Time Wait",
	"Close",
	"Close Wait",
	"Last ACK",
	"Listen",
	"Closing"
};
#endif

496 static__inline__voidtcp_set_state(structsock *sk, intstate)
/* */ 497 { 498 if(sk->state==TCP_ESTABLISHED)
499 tcp_statistics.TcpCurrEstab--;
500 #ifdefSTATE_TRACE 501 if(sk->debug)
502 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
503 #endif 504 /* This is a hack but it doesn't occur often and it's going to 505 be a real to fix nicely */ 506
507 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
508 { 509 wake_up_interruptible(&master_select_wakeup);
510 } 511 sk->state=state;
512 if(state==TCP_ESTABLISHED)
513 tcp_statistics.TcpCurrEstab++;
514 if(sk->state==TCP_CLOSE)
515 tcp_cache_zap();
516 } 517
518 /* 519 * This routine picks a TCP windows for a socket based on 520 * the following constraints 521 * 522 * 1. The window can never be shrunk once it is offered (RFC 793) 523 * 2. We limit memory per socket 524 * 525 * For now we use NET2E3's heuristic of offering half the memory 526 * we have handy. All is not as bad as this seems however because 527 * of two things. Firstly we will bin packets even within the window 528 * in order to get the data we are waiting for into the memory limit. 529 * Secondly we bin common duplicate forms at receive time 530 * Better heuristics welcome 531 */ 532
533 inttcp_select_window(structsock *sk)
/* */ 534 { 535 intnew_window = sock_rspace(sk);
536
537 if(sk->window_clamp)
538 new_window=min(sk->window_clamp,new_window);
539 /* 540 * Two things are going on here. First, we don't ever offer a 541 * window less than min(sk->mss, MAX_WINDOW/2). This is the 542 * receiver side of SWS as specified in RFC1122. 543 * Second, we always give them at least the window they 544 * had before, in order to avoid retracting window. This 545 * is technically allowed, but RFC1122 advises against it and 546 * in practice it causes trouble. 547 * 548 * Fixme: This doesn't correctly handle the case where 549 * new_window > sk->window but not by enough to allow for the 550 * shift in sequence space. 551 */ 552 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
553 return(sk->window);
554 return(new_window);
555 } 556
557 /* 558 * Find someone to 'accept'. Must be called with 559 * sk->inuse=1 or cli() 560 */ 561
562 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 563 { 564 structsk_buff *p=skb_peek(&s->receive_queue);
565 if(p==NULL)
566 returnNULL;
567 do 568 { 569 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
570 returnp;
571 p=p->next;
572 } 573 while(p!=(structsk_buff *)&s->receive_queue);
574 returnNULL;
575 } 576
577 /* 578 * Remove a completed connection and return it. This is used by 579 * tcp_accept() to get connections from the queue. 580 */ 581
582 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 583 { 584 structsk_buff *skb;
585 unsignedlongflags;
586 save_flags(flags);
587 cli();
588 skb=tcp_find_established(s);
589 if(skb!=NULL)
590 skb_unlink(skb); /* Take it off the queue */ 591 restore_flags(flags);
592 returnskb;
593 } 594
595 /* 596 * This routine closes sockets which have been at least partially 597 * opened, but not yet accepted. Currently it is only called by 598 * tcp_close, and timeout mirrors the value there. 599 */ 600
601 staticvoidtcp_close_pending (structsock *sk)
/* */ 602 { 603 structsk_buff *skb;
604
605 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
606 { 607 skb->sk->dead=1;
608 tcp_close(skb->sk, 0);
609 kfree_skb(skb, FREE_READ);
610 } 611 return;
612 } 613
614 /* 615 * Enter the time wait state. 616 */ 617
618 staticvoidtcp_time_wait(structsock *sk)
/* */ 619 { 620 tcp_set_state(sk,TCP_TIME_WAIT);
621 sk->shutdown = SHUTDOWN_MASK;
622 if (!sk->dead)
623 sk->state_change(sk);
624 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
625 } 626
627 /* 628 * A socket has timed out on its send queue and wants to do a 629 * little retransmitting. Currently this means TCP. 630 */ 631
632 voidtcp_do_retransmit(structsock *sk, intall)
/* */ 633 { 634 structsk_buff * skb;
635 structproto *prot;
636 structdevice *dev;
637 intct=0;
638 structrtable *rt;
639
640 prot = sk->prot;
641 skb = sk->send_head;
642
643 while (skb != NULL)
644 { 645 structtcphdr *th;
646 structiphdr *iph;
647 intsize;
648
649 dev = skb->dev;
650 IS_SKB(skb);
651 skb->when = jiffies;
652
653 /* 654 * Discard the surplus MAC header 655 */ 656
657 skb_pull(skb,((unsignedchar *)skb->ip_hdr)-skb->data);
658
659 /* 660 * In general it's OK just to use the old packet. However we 661 * need to use the current ack and window fields. Urg and 662 * urg_ptr could possibly stand to be updated as well, but we 663 * don't keep the necessary data. That shouldn't be a problem, 664 * if the other end is doing the right thing. Since we're 665 * changing the packet, we have to issue a new IP identifier. 666 */ 667
668 iph = (structiphdr *)skb->data;
669 th = (structtcphdr *)(((char *)iph) + (iph->ihl << 2));
670 size = ntohs(iph->tot_len) - (iph->ihl<<2);
671
672 /* 673 * Note: We ought to check for window limits here but 674 * currently this is done (less efficiently) elsewhere. 675 */ 676
677 /* 678 * Put a MAC header back on (may cause ARPing) 679 */ 680
681 { 682 /* ANK: UGLY, but the bug, that was here, should be fixed. 683 */ 684 structoptions * opt = (structoptions*)skb->proto_priv;
685 rt = ip_check_route(&sk->ip_route_cache, opt->srr?opt->faddr:iph->daddr, skb->localroute);
686 } 687
688 iph->id = htons(ip_id_count++);
689 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 690 if (rt && ntohs(iph->tot_len) > rt->rt_mtu)
691 iph->frag_off &= ~htons(IP_DF);
692 #endif 693 ip_send_check(iph);
694
695 if (rt==NULL) /* Deep poo */ 696 { 697 if(skb->sk)
698 { 699 skb->sk->err_soft=ENETUNREACH;
700 skb->sk->error_report(skb->sk);
701 } 702 } 703 else 704 { 705 dev=rt->rt_dev;
706 skb->raddr=rt->rt_gateway;
707 skb->dev=dev;
708 skb->arp=1;
709 if (rt->rt_hh)
710 { 711 memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len);
712 if (!rt->rt_hh->hh_uptodate)
713 { 714 skb->arp = 0;
715 #ifRT_CACHE_DEBUG >= 2
716 printk("tcp_do_retransmit: hh miss %08x via %08x\n", iph->daddr, rt->rt_gateway);
717 #endif 718 } 719 } 720 elseif (dev->hard_header)
721 { 722 if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
723 skb->arp=0;
724 } 725
726 /* 727 * This is not the right way to handle this. We have to 728 * issue an up to date window and ack report with this 729 * retransmit to keep the odd buggy tcp that relies on 730 * the fact BSD does this happy. 731 * We don't however need to recalculate the entire 732 * checksum, so someone wanting a small problem to play 733 * with might like to implement RFC1141/RFC1624 and speed 734 * this up by avoiding a full checksum. 735 */ 736
737 th->ack_seq = htonl(sk->acked_seq);
738 th->window = ntohs(tcp_select_window(sk));
739 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
740
741 /* 742 * If the interface is (still) up and running, kick it. 743 */ 744
745 if (dev->flags & IFF_UP)
746 { 747 /* 748 * If the packet is still being sent by the device/protocol 749 * below then don't retransmit. This is both needed, and good - 750 * especially with connected mode AX.25 where it stops resends 751 * occurring of an as yet unsent anyway frame! 752 * We still add up the counts as the round trip time wants 753 * adjusting. 754 */ 755 if (sk && !skb_device_locked(skb))
756 { 757 /* Remove it from any existing driver queue first! */ 758 skb_unlink(skb);
759 /* Now queue it */ 760 ip_statistics.IpOutRequests++;
761 dev_queue_xmit(skb, dev, sk->priority);
762 } 763 } 764 } 765
766 /* 767 * Count retransmissions 768 */ 769
770 ct++;
771 sk->retransmits++;
772 sk->prot->retransmits ++;
773 tcp_statistics.TcpRetransSegs++;
774
775
776 /* 777 * Only one retransmit requested. 778 */ 779
780 if (!all)
781 break;
782
783 /* 784 * This should cut it off before we send too many packets. 785 */ 786
787 if (ct >= sk->cong_window)
788 break;
789 skb = skb->link3;
790 } 791 } 792
793 /* 794 * Reset the retransmission timer 795 */ 796
797 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 798 { 799 del_timer(&sk->retransmit_timer);
800 sk->ip_xmit_timeout = why;
801 if((long)when < 0)
802 { 803 when=3;
804 printk("Error: Negative timer in xmit_timer\n");
805 } 806 sk->retransmit_timer.expires=jiffies+when;
807 add_timer(&sk->retransmit_timer);
808 } 809
810 /* 811 * This is the normal code called for timeouts. It does the retransmission 812 * and then does backoff. tcp_do_retransmit is separated out because 813 * tcp_ack needs to send stuff from the retransmit queue without 814 * initiating a backoff. 815 */ 816
817
818 voidtcp_retransmit_time(structsock *sk, intall)
/* */ 819 { 820 tcp_do_retransmit(sk, all);
821
822 /* 823 * Increase the timeout each time we retransmit. Note that 824 * we do not increase the rtt estimate. rto is initialized 825 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 826 * that doubling rto each time is the least we can get away with. 827 * In KA9Q, Karn uses this for the first few times, and then 828 * goes to quadratic. netBSD doubles, but only goes up to *64, 829 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 830 * defined in the protocol as the maximum possible RTT. I guess 831 * we'll have to use something other than TCP to talk to the 832 * University of Mars. 833 * 834 * PAWS allows us longer timeouts and large windows, so once 835 * implemented ftp to mars will work nicely. We will have to fix 836 * the 120 second clamps though! 837 */ 838
839 sk->retransmits++;
840 sk->prot->retransmits++;
841 sk->backoff++;
842 sk->rto = min(sk->rto << 1, 120*HZ);
843 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
844 } 845
846
847 /* 848 * A timer event has trigger a tcp retransmit timeout. The 849 * socket xmit queue is ready and set up to send. Because 850 * the ack receive code keeps the queue straight we do 851 * nothing clever here. 852 */ 853
854 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 855 { 856 if (all)
857 { 858 tcp_retransmit_time(sk, all);
859 return;
860 } 861
862 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 863 /* sk->ssthresh in theory can be zero. I guess that's OK */ 864 sk->cong_count = 0;
865
866 sk->cong_window = 1;
867
868 /* Do the actual retransmit. */ 869 tcp_retransmit_time(sk, all);
870 } 871
872 /* 873 * A write timeout has occurred. Process the after effects. 874 */ 875
876 staticinttcp_write_timeout(structsock *sk)
/* */ 877 { 878 /* 879 * Look for a 'soft' timeout. 880 */ 881 if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
882 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
883 { 884 /* 885 * Attempt to recover if arp has changed (unlikely!) or 886 * a route has shifted (not supported prior to 1.3). 887 */ 888 ip_rt_advice(&sk->ip_route_cache, 0);
889 } 890
891 /* 892 * Have we tried to SYN too many times (repent repent 8)) 893 */ 894
895 if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
896 { 897 if(sk->err_soft)
898 sk->err=sk->err_soft;
899 else 900 sk->err=ETIMEDOUT;
901 sk->error_report(sk);
902 del_timer(&sk->retransmit_timer);
903 tcp_statistics.TcpAttemptFails++; /* Is this right ??? - FIXME - */ 904 tcp_set_state(sk,TCP_CLOSE);
905 /* Don't FIN, we got nothing back */ 906 release_sock(sk);
907 return 0;
908 } 909 /* 910 * Has it gone just too far ? 911 */ 912 if (sk->retransmits > TCP_RETR2)
913 { 914 if(sk->err_soft)
915 sk->err = sk->err_soft;
916 else 917 sk->err = ETIMEDOUT;
918 sk->error_report(sk);
919 del_timer(&sk->retransmit_timer);
920 /* 921 * Time wait the socket 922 */ 923 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
924 { 925 tcp_set_state(sk,TCP_TIME_WAIT);
926 reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
927 } 928 else 929 { 930 /* 931 * Clean up time. 932 */ 933 tcp_set_state(sk, TCP_CLOSE);
934 release_sock(sk);
935 return 0;
936 } 937 } 938 return 1;
939 } 940
941 /* 942 * The TCP retransmit timer. This lacks a few small details. 943 * 944 * 1. An initial rtt timeout on the probe0 should cause what we can 945 * of the first write queue buffer to be split and sent. 946 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report 947 * ETIMEDOUT if we know an additional 'soft' error caused this. 948 * tcp_err should save a 'soft error' for us. 949 */ 950
951 staticvoidretransmit_timer(unsignedlongdata)
/* */ 952 { 953 structsock *sk = (structsock*)data;
954 intwhy = sk->ip_xmit_timeout;
955
956 /* 957 * We are reset. We will send no more retransmits. 958 */ 959
960 if(sk->zapped)
961 return;
962
963 /* 964 * Only process if socket is not in use 965 */ 966
967 cli();
968 if (sk->inuse || in_bh)
969 { 970 /* Try again in 1 second */ 971 sk->retransmit_timer.expires = jiffies+HZ;
972 add_timer(&sk->retransmit_timer);
973 sti();
974 return;
975 } 976
977 sk->inuse = 1;
978 sti();
979
980 /* Always see if we need to send an ack. */ 981
982 if (sk->ack_backlog)
983 { 984 sk->prot->read_wakeup (sk);
985 if (! sk->dead)
986 sk->data_ready(sk,0);
987 } 988
989 /* Now we need to figure out why the socket was on the timer. */ 990
991 switch (why)
992 { 993 /* Window probing */ 994 caseTIME_PROBE0:
995 tcp_send_probe0(sk);
996 tcp_write_timeout(sk);
997 break;
998 /* Retransmitting */ 999 caseTIME_WRITE:
1000 /* It could be we got here because we needed to send an ack.1001 * So we need to check for that.1002 */1003 {1004 structsk_buff *skb;
1005 unsignedlongflags;
1006
1007 save_flags(flags);
1008 cli();
1009 skb = sk->send_head;
1010 if (!skb)
1011 {1012 restore_flags(flags);
1013 }1014 else1015 {1016 /*1017 * Kicked by a delayed ack. Reset timer1018 * correctly now1019 */1020 if (jiffies < skb->when + sk->rto)
1021 {1022 reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
1023 restore_flags(flags);
1024 break;
1025 }1026 restore_flags(flags);
1027 /*1028 * Retransmission1029 */1030 sk->retransmits++;
1031 sk->prot->retransmits++;
1032 sk->prot->retransmit (sk, 0);
1033 tcp_write_timeout(sk);
1034 }1035 break;
1036 }1037 /* Sending Keepalives */1038 caseTIME_KEEPOPEN:
1039 /* 1040 * this reset_timer() call is a hack, this is not1041 * how KEEPOPEN is supposed to work.1042 */1043 reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1044
1045 /* Send something to keep the connection open. */1046 if (sk->prot->write_wakeup)
1047 sk->prot->write_wakeup (sk);
1048 sk->retransmits++;
1049 sk->prot->retransmits++;
1050 tcp_write_timeout(sk);
1051 break;
1052 default:
1053 printk ("rexmit_timer: timer expired - reason unknown\n");
1054 break;
1055 }1056 release_sock(sk);
1057 }1058
1059 /*1060 * This routine is called by the ICMP module when it gets some1061 * sort of error condition. If err < 0 then the socket should1062 * be closed and the error returned to the user. If err > 01063 * it's just the icmp type << 8 | icmp code. After adjustment1064 * header points to the first 8 bytes of the tcp header. We need1065 * to find the appropriate port.1066 */1067
1068 voidtcp_err(inttype, intcode, unsignedchar *header, __u32daddr,
/* */1069 __u32saddr, structinet_protocol *protocol)
1070 {1071 structtcphdr *th = (structtcphdr *)header;
1072 structsock *sk;
1073
1074 /*1075 * This one is _WRONG_. FIXME urgently.1076 */1077 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1078 structiphdr *iph=(structiphdr *)(header-sizeof(structiphdr));
1079 #endif1080 th =(structtcphdr *)header;
1081 sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
1082
1083 if (sk == NULL)
1084 return;
1085
1086 if (type == ICMP_SOURCE_QUENCH)
1087 {1088 /*1089 * FIXME:1090 * For now we will just trigger a linear backoff.1091 * The slow start code should cause a real backoff here.1092 */1093 if (sk->cong_window > 4)
1094 sk->cong_window--;
1095 return;
1096 }1097
1098 if (type == ICMP_PARAMETERPROB)
1099 {1100 sk->err=EPROTO;
1101 sk->error_report(sk);
1102 }1103
1104 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1105 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
1106 {1107 structrtable * rt;
1108 /*1109 * Ugly trick to pass MTU to protocol layer.1110 * Really we should add argument "info" to error handler.1111 */1112 unsignedshortnew_mtu = ntohs(iph->id);
1113
1114 if ((rt = sk->ip_route_cache) != NULL)
1115 if (rt->rt_mtu > new_mtu)
1116 rt->rt_mtu = new_mtu;
1117
1118 if (sk->mtu > new_mtu - sizeof(structiphdr) - sizeof(structtcphdr)
1119 && new_mtu > sizeof(structiphdr)+sizeof(structtcphdr))
1120 sk->mtu = new_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
1121
1122 return;
1123 }1124 #endif1125
1126 /*1127 * If we've already connected we will keep trying1128 * until we time out, or the user gives up.1129 */1130
1131 if (code < 13)
1132 {1133 if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1134 {1135 sk->err = icmp_err_convert[code].errno;
1136 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1137 {1138 tcp_statistics.TcpAttemptFails++;
1139 tcp_set_state(sk,TCP_CLOSE);
1140 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */1141 }1142 }1143 else/* Only an error on timeout */1144 sk->err_soft = icmp_err_convert[code].errno;
1145 }1146 }1147
1148
1149 /*1150 * Walk down the receive queue counting readable data until we hit the end or we find a gap1151 * in the received data queue (ie a frame missing that needs sending to us). Not1152 * sorting using two queues as data arrives makes life so much harder.1153 */1154
1155 staticinttcp_readable(structsock *sk)
/* */1156 {1157 unsignedlongcounted;
1158 unsignedlongamount;
1159 structsk_buff *skb;
1160 intsum;
1161 unsignedlongflags;
1162
1163 if(sk && sk->debug)
1164 printk("tcp_readable: %p - ",sk);
1165
1166 save_flags(flags);
1167 cli();
1168 if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
1169 {1170 restore_flags(flags);
1171 if(sk && sk->debug)
1172 printk("empty\n");
1173 return(0);
1174 }1175
1176 counted = sk->copied_seq; /* Where we are at the moment */1177 amount = 0;
1178
1179 /* 1180 * Do until a push or until we are out of data. 1181 */1182
1183 do1184 {1185 if (before(counted, skb->seq)) /* Found a hole so stops here */1186 break;
1187 sum = skb->len - (counted - skb->seq); /* Length - header but start from where we are up to (avoid overlaps) */1188 if (skb->h.th->syn)
1189 sum++;
1190 if (sum > 0)
1191 {/* Add it up, move on */1192 amount += sum;
1193 if (skb->h.th->syn)
1194 amount--;
1195 counted += sum;
1196 }1197 /*1198 * Don't count urg data ... but do it in the right place!1199 * Consider: "old_data (ptr is here) URG PUSH data"1200 * The old code would stop at the first push because1201 * it counted the urg (amount==1) and then does amount--1202 * *after* the loop. This means tcp_readable() always1203 * returned zero if any URG PUSH was in the queue, even1204 * though there was normal data available. If we subtract1205 * the urg data right here, we even get it to work for more1206 * than one URG PUSH skb without normal data.1207 * This means that select() finally works now with urg data1208 * in the queue. Note that rlogin was never affected1209 * because it doesn't use select(); it uses two processes1210 * and a blocking read(). And the queue scan in tcp_read()1211 * was correct. Mike <pall@rz.uni-karlsruhe.de>1212 */1213 if (skb->h.th->urg)
1214 amount--; /* don't count urg data */1215 if (amount && skb->h.th->psh) break;
1216 skb = skb->next;
1217 }1218 while(skb != (structsk_buff *)&sk->receive_queue);
1219
1220 restore_flags(flags);
1221 if(sk->debug)
1222 printk("got %lu bytes.\n",amount);
1223 return(amount);
1224 }1225
1226 /*1227 * LISTEN is a special case for select..1228 */1229 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */1230 {1231 if (sel_type == SEL_IN) {1232 intretval;
1233
1234 sk->inuse = 1;
1235 retval = (tcp_find_established(sk) != NULL);
1236 release_sock(sk);
1237 if (!retval)
1238 select_wait(&master_select_wakeup,wait);
1239 returnretval;
1240 }1241 return 0;
1242 }1243
1244
1245 /*1246 * Wait for a TCP event.1247 *1248 * Note that we don't need to set "sk->inuse", as the upper select layers1249 * take care of normal races (between the test and the event) and we don't1250 * go look at any of the socket buffers directly.1251 */1252 staticinttcp_select(structsock *sk, intsel_type, select_table *wait)
/* */1253 {1254 if (sk->state == TCP_LISTEN)
1255 returntcp_listen_select(sk, sel_type, wait);
1256
1257 switch(sel_type) {1258 caseSEL_IN:
1259 if (sk->err)
1260 return 1;
1261 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1262 break;
1263
1264 if (sk->shutdown & RCV_SHUTDOWN)
1265 return 1;
1266
1267 if (sk->acked_seq == sk->copied_seq)
1268 break;
1269
1270 if (sk->urg_seq != sk->copied_seq ||
1271 sk->acked_seq != sk->copied_seq+1 ||
1272 sk->urginline || !sk->urg_data)
1273 return 1;
1274 break;
1275
1276 caseSEL_OUT:
1277 if (sk->err)
1278 return 1;
1279 if (sk->shutdown & SEND_SHUTDOWN)
1280 return 0;
1281 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1282 break;
1283 /*1284 * This is now right thanks to a small fix1285 * by Matt Dillon.1286 */1287
1288 if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
1289 break;
1290 return 1;
1291
1292 caseSEL_EX:
1293 if (sk->urg_data)
1294 return 1;
1295 break;
1296 }1297 select_wait(sk->sleep, wait);
1298 return 0;
1299 }1300
1301 inttcp_ioctl(structsock *sk, intcmd, unsignedlongarg)
/* */1302 {1303 interr;
1304 switch(cmd)
1305 {1306
1307 caseTIOCINQ:
1308 #ifdef FIXME /* FIXME: */1309 caseFIONREAD:
1310 #endif1311 {1312 unsignedlongamount;
1313
1314 if (sk->state == TCP_LISTEN)
1315 return(-EINVAL);
1316
1317 sk->inuse = 1;
1318 amount = tcp_readable(sk);
1319 release_sock(sk);
1320 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1321 if(err)
1322 returnerr;
1323 put_user(amount, (int *)arg);
1324 return(0);
1325 }1326 caseSIOCATMARK:
1327 {1328 intansw = sk->urg_data && sk->urg_seq == sk->copied_seq;
1329
1330 err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1331 if (err)
1332 returnerr;
1333 put_user(answ,(int *) arg);
1334 return(0);
1335 }1336 caseTIOCOUTQ:
1337 {1338 unsignedlongamount;
1339
1340 if (sk->state == TCP_LISTEN) return(-EINVAL);
1341 amount = sock_wspace(sk);
1342 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1343 if(err)
1344 returnerr;
1345 put_user(amount, (int *)arg);
1346 return(0);
1347 }1348 default:
1349 return(-EINVAL);
1350 }1351 }1352
1353
1354 /*1355 * This routine computes a TCP checksum. 1356 *1357 * Modified January 1995 from a go-faster DOS routine by1358 * Jorge Cwik <jorge@laser.satlink.net>1359 */1360
1361 unsignedshorttcp_check(structtcphdr *th, intlen,
/* */1362 unsignedlongsaddr, unsignedlongdaddr, unsignedlongbase)
1363 {1364 returncsum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
1365 }1366
1367
1368
1369 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1370 unsignedlongdaddr, intlen, structsock *sk)
1371 {1372 th->check = 0;
1373 th->check = tcp_check(th, len, saddr, daddr,
1374 csum_partial((char *)th,len,0));
1375 return;
1376 }1377
1378 /*1379 * This is the main buffer sending routine. We queue the buffer1380 * having checked it is sane seeming.1381 */1382
1383 staticvoidtcp_send_skb(structsock *sk, structsk_buff *skb)
/* */1384 {1385 intsize;
1386 structtcphdr * th = skb->h.th;
1387
1388 /*1389 * length of packet (not counting length of pre-tcp headers) 1390 */1391
1392 size = skb->len - ((unsignedchar *) th - skb->data);
1393
1394 /*1395 * Sanity check it.. 1396 */1397
1398 if (size < sizeof(structtcphdr) || size > skb->len)
1399 {1400 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1401 skb, skb->data, th, skb->len);
1402 kfree_skb(skb, FREE_WRITE);
1403 return;
1404 }1405
1406 /*1407 * If we have queued a header size packet.. (these crash a few1408 * tcp stacks if ack is not set)1409 */1410
1411 if (size == sizeof(structtcphdr))
1412 {1413 /* If it's got a syn or fin it's notionally included in the size..*/1414 if(!th->syn && !th->fin)
1415 {1416 printk("tcp_send_skb: attempt to queue a bogon.\n");
1417 kfree_skb(skb,FREE_WRITE);
1418 return;
1419 }1420 }1421
1422 /*1423 * Actual processing.1424 */1425
1426 tcp_statistics.TcpOutSegs++;
1427 skb->seq = ntohl(th->seq);
1428 skb->end_seq = skb->seq + size - 4*th->doff;
1429
1430 /*1431 * We must queue if1432 *1433 * a) The right edge of this frame exceeds the window1434 * b) We are retransmitting (Nagle's rule)1435 * c) We have too many packets 'in flight'1436 */1437
1438 if (after(skb->end_seq, sk->window_seq) ||
1439 (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1440 sk->packets_out >= sk->cong_window)
1441 {1442 /* checksum will be supplied by tcp_write_xmit. So1443 * we shouldn't need to set it at all. I'm being paranoid */1444 th->check = 0;
1445 if (skb->next != NULL)
1446 {1447 printk("tcp_send_partial: next != NULL\n");
1448 skb_unlink(skb);
1449 }1450 skb_queue_tail(&sk->write_queue, skb);
1451
1452 /*1453 * If we don't fit we have to start the zero window1454 * probes. This is broken - we really need to do a partial1455 * send _first_ (This is what causes the Cisco and PC/TCP1456 * grief).1457 */1458
1459 if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
1460 sk->send_head == NULL && sk->ack_backlog == 0)
1461 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1462 }1463 else1464 {1465 /*1466 * This is going straight out1467 */1468
1469 th->ack_seq = htonl(sk->acked_seq);
1470 th->window = htons(tcp_select_window(sk));
1471
1472 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1473
1474 sk->sent_seq = sk->write_seq;
1475
1476 /*1477 * This is mad. The tcp retransmit queue is put together1478 * by the ip layer. This causes half the problems with1479 * unroutable FIN's and other things.1480 */1481
1482 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1483
1484 /*1485 * Set for next retransmit based on expected ACK time.1486 * FIXME: We set this every time which means our 1487 * retransmits are really about a window behind.1488 */1489
1490 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1491 }1492 }1493
1494 /*1495 * Locking problems lead us to a messy situation where we can have1496 * multiple partially complete buffers queued up. This is really bad1497 * as we don't want to be sending partial buffers. Fix this with1498 * a semaphore or similar to lock tcp_write per socket.1499 *1500 * These routines are pretty self descriptive.1501 */1502
1503 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1504 {1505 structsk_buff * skb;
1506 unsignedlongflags;
1507
1508 save_flags(flags);
1509 cli();
1510 skb = sk->partial;
1511 if (skb) {1512 sk->partial = NULL;
1513 del_timer(&sk->partial_timer);
1514 }1515 restore_flags(flags);
1516 returnskb;
1517 }1518
1519 /*1520 * Empty the partial queue1521 */1522
1523 staticvoidtcp_send_partial(structsock *sk)
/* */1524 {1525 structsk_buff *skb;
1526
1527 if (sk == NULL)
1528 return;
1529 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1530 tcp_send_skb(sk, skb);
1531 }1532
1533 /*1534 * Queue a partial frame1535 */1536
1537 voidtcp_enqueue_partial(structsk_buff * skb, structsock * sk)
/* */1538 {1539 structsk_buff * tmp;
1540 unsignedlongflags;
1541
1542 save_flags(flags);
1543 cli();
1544 tmp = sk->partial;
1545 if (tmp)
1546 del_timer(&sk->partial_timer);
1547 sk->partial = skb;
1548 init_timer(&sk->partial_timer);
1549 /*1550 * Wait up to 1 second for the buffer to fill.1551 */1552 sk->partial_timer.expires = jiffies+HZ;
1553 sk->partial_timer.function = (void (*)(unsignedlong)) tcp_send_partial;
1554 sk->partial_timer.data = (unsignedlong) sk;
1555 add_timer(&sk->partial_timer);
1556 restore_flags(flags);
1557 if (tmp)
1558 tcp_send_skb(sk, tmp);
1559 }1560
1561
1562 /*1563 * This routine sends an ack and also updates the window. 1564 */1565
1566 staticvoidtcp_send_ack(u32sequence, u32ack,
/* */1567 structsock *sk,
1568 structtcphdr *th, unsignedlongdaddr)
1569 {1570 structsk_buff *buff;
1571 structtcphdr *t1;
1572 structdevice *dev = NULL;
1573 inttmp;
1574
1575 if(sk->zapped)
1576 return; /* We have been reset, we may not send again */1577
1578 /*1579 * We need to grab some memory, and put together an ack,1580 * and then put it into the queue to be sent.1581 */1582
1583 buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1584 if (buff == NULL)
1585 {1586 /* 1587 * Force it to send an ack. We don't have to do this1588 * (ACK is unreliable) but it's much better use of 1589 * bandwidth on slow links to send a spare ack than1590 * resend packets. 1591 */1592
1593 sk->ack_backlog++;
1594 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
1595 {1596 reset_xmit_timer(sk, TIME_WRITE, HZ);
1597 }1598 return;
1599 }1600
1601 /*1602 * Assemble a suitable TCP frame1603 */1604
1605 buff->sk = sk;
1606 buff->localroute = sk->localroute;
1607
1608 /* 1609 * Put in the IP header and routing stuff. 1610 */1611
1612 tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1613 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1614 if (tmp < 0)
1615 {1616 buff->free = 1;
1617 sock_wfree(sk, buff);
1618 return;
1619 }1620 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
1621
1622 memcpy(t1, th, sizeof(*t1));
1623
1624 /*1625 * Swap the send and the receive. 1626 */1627
1628 t1->dest = th->source;
1629 t1->source = th->dest;
1630 t1->seq = ntohl(sequence);
1631 t1->ack = 1;
1632 sk->window = tcp_select_window(sk);
1633 t1->window = ntohs(sk->window);
1634 t1->res1 = 0;
1635 t1->res2 = 0;
1636 t1->rst = 0;
1637 t1->urg = 0;
1638 t1->syn = 0;
1639 t1->psh = 0;
1640 t1->fin = 0;
1641
1642 /*1643 * If we have nothing queued for transmit and the transmit timer1644 * is on we are just doing an ACK timeout and need to switch1645 * to a keepalive.1646 */1647
1648 if (ack == sk->acked_seq)
1649 {1650 sk->ack_backlog = 0;
1651 sk->bytes_rcv = 0;
1652 sk->ack_timed = 0;
1653 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL1654 && sk->ip_xmit_timeout == TIME_WRITE)
1655 {1656 if(sk->keepopen) {1657 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1658 }else{1659 delete_timer(sk);
1660 }1661 }1662 }1663
1664 /*1665 * Fill in the packet and send it1666 */1667
1668 t1->ack_seq = htonl(ack);
1669 t1->doff = sizeof(*t1)/4;
1670 tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1671 if (sk->debug)
1672 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
1673 tcp_statistics.TcpOutSegs++;
1674 sk->prot->queue_xmit(sk, dev, buff, 1);
1675 }1676
1677
1678 /* 1679 * This routine builds a generic TCP header. 1680 */1681
1682 extern__inlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */1683 {1684
1685 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1686 th->seq = htonl(sk->write_seq);
1687 th->psh =(push == 0) ? 1 : 0;
1688 th->doff = sizeof(*th)/4;
1689 th->ack = 1;
1690 th->fin = 0;
1691 sk->ack_backlog = 0;
1692 sk->bytes_rcv = 0;
1693 sk->ack_timed = 0;
1694 th->ack_seq = htonl(sk->acked_seq);
1695 sk->window = tcp_select_window(sk);
1696 th->window = htons(sk->window);
1697
1698 return(sizeof(*th));
1699 }1700
1701 /*1702 * This routine copies from a user buffer into a socket,1703 * and starts the transmit system.1704 */1705
1706 staticinttcp_sendmsg(structsock *sk, structmsghdr *msg,
/* */1707 intlen, intnonblock, intflags)
1708 {1709 intcopied = 0;
1710 intcopy;
1711 inttmp;
1712 intseglen;
1713 intiovct=0;
1714 structsk_buff *skb;
1715 structsk_buff *send_tmp;
1716 structproto *prot;
1717 structdevice *dev = NULL;
1718 unsignedchar *from;
1719
1720 /*1721 * Do sanity checking for sendmsg/sendto/send1722 */1723
1724 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1725 return -EINVAL;
1726 if (msg->msg_name)
1727 {1728 structsockaddr_in *addr=(structsockaddr_in *)msg->msg_name;
1729 if(sk->state == TCP_CLOSE)
1730 return -ENOTCONN;
1731 if (msg->msg_namelen < sizeof(*addr))
1732 return -EINVAL;
1733 if (addr->sin_family && addr->sin_family != AF_INET)
1734 return -EINVAL;
1735 if (addr->sin_port != sk->dummy_th.dest)
1736 return -EISCONN;
1737 if (addr->sin_addr.s_addr != sk->daddr)
1738 return -EISCONN;
1739 }1740
1741 /*1742 * Ok commence sending1743 */1744
1745 while(iovct<msg->msg_iovlen)
1746 {1747 seglen=msg->msg_iov[iovct].iov_len;
1748 from=msg->msg_iov[iovct++].iov_base;
1749 sk->inuse=1;
1750 prot = sk->prot;
1751 while(seglen > 0)
1752 {1753 if (sk->err)
1754 {/* Stop on an error */1755 release_sock(sk);
1756 if (copied)
1757 return(copied);
1758 returnsock_error(sk);
1759 }1760
1761 /*1762 * First thing we do is make sure that we are established. 1763 */1764
1765 if (sk->shutdown & SEND_SHUTDOWN)
1766 {1767 release_sock(sk);
1768 sk->err = EPIPE;
1769 if (copied)
1770 return(copied);
1771 sk->err = 0;
1772 return(-EPIPE);
1773 }1774
1775 /* 1776 * Wait for a connection to finish.1777 */1778
1779 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
1780 {1781 if (sk->err)
1782 {1783 release_sock(sk);
1784 if (copied)
1785 return(copied);
1786 returnsock_error(sk);
1787 }1788
1789 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
1790 {1791 release_sock(sk);
1792 if (copied)
1793 return(copied);
1794
1795 if (sk->err)
1796 returnsock_error(sk);
1797
1798 if (sk->keepopen)
1799 {1800 send_sig(SIGPIPE, current, 0);
1801 }1802 return(-EPIPE);
1803 }1804
1805 if (nonblock || copied)
1806 {1807 release_sock(sk);
1808 if (copied)
1809 return(copied);
1810 return(-EAGAIN);
1811 }1812
1813 release_sock(sk);
1814 cli();
1815
1816 if (sk->state != TCP_ESTABLISHED &&
1817 sk->state != TCP_CLOSE_WAIT && sk->err == 0)
1818 {1819 interruptible_sleep_on(sk->sleep);
1820 if (current->signal & ~current->blocked)
1821 {1822 sti();
1823 if (copied)
1824 return(copied);
1825 return(-ERESTARTSYS);
1826 }1827 }1828 sk->inuse = 1;
1829 sti();
1830 }1831
1832 /*1833 * The following code can result in copy <= if sk->mss is ever1834 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).1835 * sk->mtu is constant once SYN processing is finished. I.e. we1836 * had better not get here until we've seen his SYN and at least one1837 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)1838 * But ESTABLISHED should guarantee that. sk->max_window is by definition1839 * non-decreasing. Note that any ioctl to set user_mss must be done1840 * before the exchange of SYN's. If the initial ack from the other1841 * end has a window of 0, max_window and thus mss will both be 0.1842 */1843
1844 /* 1845 * Now we need to check if we have a half built packet. 1846 */1847 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1848 /*1849 * FIXME: I'm almost sure that this fragment is BUG,1850 * but it works... I do not know why 8) --ANK1851 *1852 * Really, we should rebuild all the queues...1853 * It's difficult. Temprorary hack is to send all1854 * queued segments with allowed fragmentation.1855 */1856 {1857 intnew_mss = min(sk->mtu, sk->max_window);
1858 if (new_mss < sk->mss)
1859 {1860 tcp_send_partial(sk);
1861 sk->mss = new_mss;
1862 }1863 }1864 #endif1865
1866 if ((skb = tcp_dequeue_partial(sk)) != NULL)
1867 {1868 inthdrlen;
1869
1870 /* IP header + TCP header */1871 hdrlen = ((unsignedlong)skb->h.th - (unsignedlong)skb->data)
1872 + sizeof(structtcphdr);
1873
1874 /* Add more stuff to the end of skb->len */1875 if (!(flags & MSG_OOB))
1876 {1877 copy = min(sk->mss - (skb->len - hdrlen), seglen);
1878 if (copy <= 0)
1879 {1880 printk("TCP: **bug**: \"copy\" <= 0\n");
1881 return -EFAULT;
1882 }1883 memcpy_fromfs(skb_put(skb,copy), from, copy);
1884 from += copy;
1885 copied += copy;
1886 len -= copy;
1887 sk->write_seq += copy;
1888 seglen -= copy;
1889 }1890 if ((skb->len - hdrlen) >= sk->mss ||
1891 (flags & MSG_OOB) || !sk->packets_out)
1892 tcp_send_skb(sk, skb);
1893 else1894 tcp_enqueue_partial(skb, sk);
1895 continue;
1896 }1897
1898 /*1899 * We also need to worry about the window.1900 * If window < 1/2 the maximum window we've seen from this1901 * host, don't use it. This is sender side1902 * silly window prevention, as specified in RFC1122.1903 * (Note that this is different than earlier versions of1904 * SWS prevention, e.g. RFC813.). What we actually do is 1905 * use the whole MSS. Since the results in the right1906 * edge of the packet being outside the window, it will1907 * be queued for later rather than sent.1908 */1909
1910 copy = sk->window_seq - sk->write_seq;
1911 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1912 copy = sk->mss;
1913 if (copy > seglen)
1914 copy = seglen;
1915
1916 /*1917 * We should really check the window here also. 1918 */1919
1920 send_tmp = NULL;
1921 if (copy < sk->mss && !(flags & MSG_OOB))
1922 {1923 /*1924 * We will release the socket in case we sleep here. 1925 */1926 release_sock(sk);
1927 /*1928 * NB: following must be mtu, because mss can be increased.1929 * mss is always <= mtu 1930 */1931 skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
1932 sk->inuse = 1;
1933 send_tmp = skb;
1934 }1935 else1936 {1937 /*1938 * We will release the socket in case we sleep here. 1939 */1940 release_sock(sk);
1941 skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
1942 sk->inuse = 1;
1943 }1944
1945 /*1946 * If we didn't get any memory, we need to sleep. 1947 */1948
1949 if (skb == NULL)
1950 {1951 sk->socket->flags |= SO_NOSPACE;
1952 if (nonblock)
1953 {1954 release_sock(sk);
1955 if (copied)
1956 return(copied);
1957 return(-EAGAIN);
1958 }1959
1960 /*1961 * FIXME: here is another race condition. 1962 */1963
1964 tmp = sk->wmem_alloc;
1965 release_sock(sk);
1966 cli();
1967 /*1968 * Again we will try to avoid it. 1969 */1970 if (tmp <= sk->wmem_alloc &&
1971 (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1972 && sk->err == 0)
1973 {1974 sk->socket->flags &= ~SO_NOSPACE;
1975 interruptible_sleep_on(sk->sleep);
1976 if (current->signal & ~current->blocked)
1977 {1978 sti();
1979 if (copied)
1980 return(copied);
1981 return(-ERESTARTSYS);
1982 }1983 }1984 sk->inuse = 1;
1985 sti();
1986 continue;
1987 }1988
1989 skb->sk = sk;
1990 skb->free = 0;
1991 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1992
1993 /*1994 * FIXME: we need to optimize this.1995 * Perhaps some hints here would be good.1996 */1997
1998 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1999 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2000 if (tmp < 0 )
2001 {2002 sock_wfree(sk, skb);
2003 release_sock(sk);
2004 if (copied)
2005 return(copied);
2006 return(tmp);
2007 }2008 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY2009 skb->ip_hdr->frag_off |= htons(IP_DF);
2010 #endif2011 skb->dev = dev;
2012 skb->h.th =(structtcphdr *)skb_put(skb,sizeof(structtcphdr));
2013 tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
2014 if (tmp < 0)
2015 {2016 sock_wfree(sk, skb);
2017 release_sock(sk);
2018 if (copied)
2019 return(copied);
2020 return(tmp);
2021 }2022
2023 if (flags & MSG_OOB)
2024 {2025 skb->h.th->urg = 1;
2026 skb->h.th->urg_ptr = ntohs(copy);
2027 }2028
2029 memcpy_fromfs(skb_put(skb,copy), from, copy);
2030
2031 from += copy;
2032 copied += copy;
2033 len -= copy;
2034 seglen -= copy;
2035 skb->free = 0;
2036 sk->write_seq += copy;
2037
2038 if (send_tmp != NULL && sk->packets_out)
2039 {2040 tcp_enqueue_partial(send_tmp, sk);
2041 continue;
2042 }2043 tcp_send_skb(sk, skb);
2044 }2045 }2046 sk->err = 0;
2047
2048 /*2049 * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly2050 * interactive fast network servers. It's meant to be on and2051 * it really improves the throughput though not the echo time2052 * on my slow slip link - Alan2053 */2054
2055 /*2056 * Avoid possible race on send_tmp - c/o Johannes Stille 2057 */2058
2059 if(sk->partial && ((!sk->packets_out)
2060 /* If not nagling we can send on the before case too.. */2061 || (sk->nonagle && before(sk->write_seq , sk->window_seq))
2062 ))
2063 tcp_send_partial(sk);
2064
2065 release_sock(sk);
2066 return(copied);
2067 }2068
2069 /*2070 * Send an ack if one is backlogged at this point. Ought to merge2071 * this with tcp_send_ack().2072 */2073
2074 staticvoidtcp_read_wakeup(structsock *sk)
/* */2075 {2076 inttmp;
2077 structdevice *dev = NULL;
2078 structtcphdr *t1;
2079 structsk_buff *buff;
2080
2081 if (!sk->ack_backlog)
2082 return;
2083
2084 /*2085 * If we're closed, don't send an ack, or we'll get a RST2086 * from the closed destination.2087 */2088 if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
2089 return;
2090
2091 /*2092 * FIXME: we need to put code here to prevent this routine from2093 * being called. Being called once in a while is ok, so only check2094 * if this is the second time in a row.2095 */2096
2097 /*2098 * We need to grab some memory, and put together an ack,2099 * and then put it into the queue to be sent.2100 */2101
2102 buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
2103 if (buff == NULL)
2104 {2105 /* Try again real soon. */2106 reset_xmit_timer(sk, TIME_WRITE, HZ);
2107 return;
2108 }2109
2110 buff->sk = sk;
2111 buff->localroute = sk->localroute;
2112
2113 /*2114 * Put in the IP header and routing stuff. 2115 */2116
2117 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
2118 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2119 if (tmp < 0)
2120 {2121 buff->free = 1;
2122 sock_wfree(sk, buff);
2123 return;
2124 }2125
2126 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
2127
2128 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
2129 t1->seq = htonl(sk->sent_seq);
2130 t1->ack = 1;
2131 t1->res1 = 0;
2132 t1->res2 = 0;
2133 t1->rst = 0;
2134 t1->urg = 0;
2135 t1->syn = 0;
2136 t1->psh = 0;
2137 sk->ack_backlog = 0;
2138 sk->bytes_rcv = 0;
2139 sk->window = tcp_select_window(sk);
2140 t1->window = htons(sk->window);
2141 t1->ack_seq = htonl(sk->acked_seq);
2142 t1->doff = sizeof(*t1)/4;
2143 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2144 sk->prot->queue_xmit(sk, dev, buff, 1);
2145 tcp_statistics.TcpOutSegs++;
2146 }2147
2148
2149 /*2150 * FIXME:2151 * This routine frees used buffers.2152 * It should consider sending an ACK to let the2153 * other end know we now have a bigger window.2154 */2155
2156 staticvoidcleanup_rbuf(structsock *sk)
/* */2157 {2158 unsignedlongflags;
2159 unsignedlongleft;
2160 structsk_buff *skb;
2161 unsignedlongrspace;
2162
2163 if(sk->debug)
2164 printk("cleaning rbuf for sk=%p\n", sk);
2165
2166 save_flags(flags);
2167 cli();
2168
2169 left = sock_rspace(sk);
2170
2171 /*2172 * We have to loop through all the buffer headers,2173 * and try to free up all the space we can.2174 */2175
2176 while((skb=skb_peek(&sk->receive_queue)) != NULL)
2177 {2178 if (!skb->used || skb->users)
2179 break;
2180 skb_unlink(skb);
2181 skb->sk = sk;
2182 kfree_skb(skb, FREE_READ);
2183 }2184
2185 restore_flags(flags);
2186
2187 /*2188 * FIXME:2189 * At this point we should send an ack if the difference2190 * in the window, and the amount of space is bigger than2191 * TCP_WINDOW_DIFF.2192 */2193
2194 if(sk->debug)
2195 printk("sk->rspace = %lu, was %lu\n", sock_rspace(sk),
2196 left);
2197 if ((rspace=sock_rspace(sk)) != left)
2198 {2199 /*2200 * This area has caused the most trouble. The current strategy2201 * is to simply do nothing if the other end has room to send at2202 * least 3 full packets, because the ack from those will auto-2203 * matically update the window. If the other end doesn't think2204 * we have much space left, but we have room for at least 1 more2205 * complete packet than it thinks we do, we will send an ack2206 * immediately. Otherwise we will wait up to .5 seconds in case2207 * the user reads some more.2208 */2209 sk->ack_backlog++;
2210 /*2211 * It's unclear whether to use sk->mtu or sk->mss here. They differ only2212 * if the other end is offering a window smaller than the agreed on MSS2213 * (called sk->mtu here). In theory there's no connection between send2214 * and receive, and so no reason to think that they're going to send2215 * small packets. For the moment I'm using the hack of reducing the mss2216 * only on the send side, so I'm putting mtu here.2217 */2218
2219 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
2220 {2221 /* Send an ack right now. */2222 tcp_read_wakeup(sk);
2223 }2224 else2225 {2226 /* Force it to send an ack soon. */2227 intwas_active = del_timer(&sk->retransmit_timer);
2228 if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
2229 {2230 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
2231 }2232 else2233 add_timer(&sk->retransmit_timer);
2234 }2235 }2236 }2237
2238
2239 /*2240 * Handle reading urgent data. BSD has very simple semantics for2241 * this, no blocking and very strange errors 8)2242 */2243
2244 staticinttcp_recv_urg(structsock * sk, intnonblock,
/* */2245 structmsghdr *msg, intlen, intflags, int *addr_len)
2246 {2247 /*2248 * No URG data to read2249 */2250 if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
2251 return -EINVAL; /* Yes this is right ! */2252
2253 if (sk->err)
2254 returnsock_error(sk);
2255
2256 if (sk->state == TCP_CLOSE || sk->done)
2257 {2258 if (!sk->done)
2259 {2260 sk->done = 1;
2261 return 0;
2262 }2263 return -ENOTCONN;
2264 }2265
2266 if (sk->shutdown & RCV_SHUTDOWN)
2267 {2268 sk->done = 1;
2269 return 0;
2270 }2271 sk->inuse = 1;
2272 if (sk->urg_data & URG_VALID)
2273 {2274 charc = sk->urg_data;
2275 if (!(flags & MSG_PEEK))
2276 sk->urg_data = URG_READ;
2277 memcpy_toiovec(msg->msg_iov, &c, 1);
2278 if(msg->msg_name)
2279 {2280 structsockaddr_in *sin=(structsockaddr_in *)msg->msg_name;
2281 sin->sin_family=AF_INET;
2282 sin->sin_addr.s_addr=sk->daddr;
2283 sin->sin_port=sk->dummy_th.dest;
2284 }2285 if(addr_len)
2286 *addr_len=sizeof(structsockaddr_in);
2287 release_sock(sk);
2288 return 1;
2289 }2290 release_sock(sk);
2291
2292 /*2293 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and2294 * the available implementations agree in this case:2295 * this call should never block, independent of the2296 * blocking state of the socket.2297 * Mike <pall@rz.uni-karlsruhe.de>2298 */2299 return -EAGAIN;
2300 }2301
2302
2303 /*2304 * This routine copies from a sock struct into the user buffer. 2305 */2306
2307 staticinttcp_recvmsg(structsock *sk, structmsghdr *msg,
/* */2308 intlen, intnonblock, intflags, int *addr_len)
2309 {2310 structwait_queuewait = {current, NULL};
2311 intcopied = 0;
2312 u32peek_seq;
2313 volatileu32 *seq; /* So gcc doesn't overoptimise */2314 unsignedlongused;
2315
2316 /* 2317 * This error should be checked. 2318 */2319
2320 if (sk->state == TCP_LISTEN)
2321 return -ENOTCONN;
2322
2323 /*2324 * Urgent data needs to be handled specially. 2325 */2326
2327 if (flags & MSG_OOB)
2328 returntcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
2329
2330 /*2331 * Copying sequence to update. This is volatile to handle2332 * the multi-reader case neatly (memcpy_to/fromfs might be 2333 * inline and thus not flush cached variables otherwise).2334 */2335
2336 peek_seq = sk->copied_seq;
2337 seq = &sk->copied_seq;
2338 if (flags & MSG_PEEK)
2339 seq = &peek_seq;
2340
2341 add_wait_queue(sk->sleep, &wait);
2342 sk->inuse = 1;
2343 while (len > 0)
2344 {2345 structsk_buff * skb;
2346 u32offset;
2347
2348 /*2349 * Are we at urgent data? Stop if we have read anything.2350 */2351
2352 if (copied && sk->urg_data && sk->urg_seq == *seq)
2353 break;
2354
2355 /*2356 * Next get a buffer.2357 */2358
2359 current->state = TASK_INTERRUPTIBLE;
2360
2361 skb = skb_peek(&sk->receive_queue);
2362 do2363 {2364 if (!skb)
2365 break;
2366 if (before(*seq, skb->seq))
2367 break;
2368 offset = *seq - skb->seq;
2369 if (skb->h.th->syn)
2370 offset--;
2371 if (offset < skb->len)
2372 gotofound_ok_skb;
2373 if (skb->h.th->fin)
2374 gotofound_fin_ok;
2375 if (!(flags & MSG_PEEK))
2376 skb->used = 1;
2377 skb = skb->next;
2378 }2379 while (skb != (structsk_buff *)&sk->receive_queue);
2380
2381 if (copied)
2382 break;
2383
2384 if (sk->err)
2385 {2386 copied = sock_error(sk);
2387 break;
2388 }2389
2390 if (sk->state == TCP_CLOSE)
2391 {2392 if (!sk->done)
2393 {2394 sk->done = 1;
2395 break;
2396 }2397 copied = -ENOTCONN;
2398 break;
2399 }2400
2401 if (sk->shutdown & RCV_SHUTDOWN)
2402 {2403 sk->done = 1;
2404 break;
2405 }2406
2407 if (nonblock)
2408 {2409 copied = -EAGAIN;
2410 break;
2411 }2412
2413 cleanup_rbuf(sk);
2414 release_sock(sk);
2415 sk->socket->flags |= SO_WAITDATA;
2416 schedule();
2417 sk->socket->flags &= ~SO_WAITDATA;
2418 sk->inuse = 1;
2419
2420 if (current->signal & ~current->blocked)
2421 {2422 copied = -ERESTARTSYS;
2423 break;
2424 }2425 continue;
2426
2427 found_ok_skb:
2428 /*2429 * Lock the buffer. We can be fairly relaxed as2430 * an interrupt will never steal a buffer we are 2431 * using unless I've missed something serious in2432 * tcp_data.2433 */2434
2435 skb->users++;
2436
2437 /*2438 * Ok so how much can we use ? 2439 */2440
2441 used = skb->len - offset;
2442 if (len < used)
2443 used = len;
2444 /*2445 * Do we have urgent data here? 2446 */2447
2448 if (sk->urg_data)
2449 {2450 u32urg_offset = sk->urg_seq - *seq;
2451 if (urg_offset < used)
2452 {2453 if (!urg_offset)
2454 {2455 if (!sk->urginline)
2456 {2457 ++*seq;
2458 offset++;
2459 used--;
2460 }2461 }2462 else2463 used = urg_offset;
2464 }2465 }2466
2467 /*2468 * Copy it - We _MUST_ update *seq first so that we2469 * don't ever double read when we have dual readers2470 */2471
2472 *seq += used;
2473
2474 /*2475 * This memcpy_tofs can sleep. If it sleeps and we2476 * do a second read it relies on the skb->users to avoid2477 * a crash when cleanup_rbuf() gets called.2478 */2479
2480 memcpy_toiovec(msg->msg_iov,((unsignedchar *)skb->h.th) +
2481 skb->h.th->doff*4 + offset, used);
2482 copied += used;
2483 len -= used;
2484
2485 /*2486 * We now will not sleep again until we are finished2487 * with skb. Sorry if you are doing the SMP port2488 * but you'll just have to fix it neatly ;)2489 */2490
2491 skb->users --;
2492
2493 if (after(sk->copied_seq,sk->urg_seq))
2494 sk->urg_data = 0;
2495 if (used + offset < skb->len)
2496 continue;
2497
2498 /*2499 * Process the FIN.2500 */2501
2502 if (skb->h.th->fin)
2503 gotofound_fin_ok;
2504 if (flags & MSG_PEEK)
2505 continue;
2506 skb->used = 1;
2507 continue;
2508
2509 found_fin_ok:
2510 ++*seq;
2511 if (flags & MSG_PEEK)
2512 break;
2513
2514 /*2515 * All is done2516 */2517
2518 skb->used = 1;
2519 sk->shutdown |= RCV_SHUTDOWN;
2520 break;
2521
2522 }2523
2524 if(copied>0 && msg->msg_name)
2525 {2526 structsockaddr_in *sin=(structsockaddr_in *)msg->msg_name;
2527 sin->sin_family=AF_INET;
2528 sin->sin_addr.s_addr=sk->daddr;
2529 sin->sin_port=sk->dummy_th.dest;
2530 }2531 if(addr_len)
2532 *addr_len=sizeof(structsockaddr_in);
2533
2534 remove_wait_queue(sk->sleep, &wait);
2535 current->state = TASK_RUNNING;
2536
2537 /* Clean up data we have read: This will do ACK frames */2538 cleanup_rbuf(sk);
2539 release_sock(sk);
2540 returncopied;
2541 }2542
2543
2544
2545 /*2546 * State processing on a close. This implements the state shift for2547 * sending our FIN frame. Note that we only send a FIN for some 2548 * states. A shutdown() may have already sent the FIN, or we may be2549 * closed.2550 */2551
2552 staticinttcp_close_state(structsock *sk, intdead)
/* */2553 {2554 intns=TCP_CLOSE;
2555 intsend_fin=0;
2556 switch(sk->state)
2557 {2558 caseTCP_SYN_SENT: /* No SYN back, no FIN needed */2559 break;
2560 caseTCP_SYN_RECV:
2561 caseTCP_ESTABLISHED: /* Closedown begin */2562 ns=TCP_FIN_WAIT1;
2563 send_fin=1;
2564 break;
2565 caseTCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */2566 caseTCP_FIN_WAIT2:
2567 caseTCP_CLOSING:
2568 ns=sk->state;
2569 break;
2570 caseTCP_CLOSE:
2571 caseTCP_LISTEN:
2572 break;
2573 caseTCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and2574 wait only for the ACK */2575 ns=TCP_LAST_ACK;
2576 send_fin=1;
2577 }2578
2579 tcp_set_state(sk,ns);
2580
2581 /*2582 * This is a (useful) BSD violating of the RFC. There is a2583 * problem with TCP as specified in that the other end could2584 * keep a socket open forever with no application left this end.2585 * We use a 3 minute timeout (about the same as BSD) then kill2586 * our end. If they send after that then tough - BUT: long enough2587 * that we won't make the old 4*rto = almost no time - whoops2588 * reset mistake.2589 */2590 if(dead && ns==TCP_FIN_WAIT2)
2591 {2592 inttimer_active=del_timer(&sk->timer);
2593 if(timer_active)
2594 add_timer(&sk->timer);
2595 else2596 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2597 }2598
2599 returnsend_fin;
2600 }2601
2602 /*2603 * Send a fin.2604 */2605
2606 staticvoidtcp_send_fin(structsock *sk)
/* */2607 {2608 structproto *prot =(structproto *)sk->prot;
2609 structtcphdr *th =(structtcphdr *)&sk->dummy_th;
2610 structtcphdr *t1;
2611 structsk_buff *buff;
2612 structdevice *dev=NULL;
2613 inttmp;
2614
2615 release_sock(sk); /* in case the malloc sleeps. */2616
2617 buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2618 sk->inuse = 1;
2619
2620 if (buff == NULL)
2621 {2622 /* This is a disaster if it occurs */2623 printk("tcp_send_fin: Impossible malloc failure");
2624 return;
2625 }2626
2627 /*2628 * Administrivia2629 */2630
2631 buff->sk = sk;
2632 buff->localroute = sk->localroute;
2633
2634 /*2635 * Put in the IP header and routing stuff. 2636 */2637
2638 tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2639 IPPROTO_TCP, sk->opt,
2640 sizeof(structtcphdr),sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2641 if (tmp < 0)
2642 {2643 intt;
2644 /*2645 * Finish anyway, treat this as a send that got lost. 2646 * (Not good).2647 */2648
2649 buff->free = 1;
2650 sock_wfree(sk,buff);
2651 sk->write_seq++;
2652 t=del_timer(&sk->timer);
2653 if(t)
2654 add_timer(&sk->timer);
2655 else2656 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2657 return;
2658 }2659
2660 /*2661 * We ought to check if the end of the queue is a buffer and2662 * if so simply add the fin to that buffer, not send it ahead.2663 */2664
2665 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
2666 buff->dev = dev;
2667 memcpy(t1, th, sizeof(*t1));
2668 buff->seq = sk->write_seq;
2669 sk->write_seq++;
2670 buff->end_seq = sk->write_seq;
2671 t1->seq = htonl(buff->seq);
2672 t1->ack = 1;
2673 t1->ack_seq = htonl(sk->acked_seq);
2674 t1->window = htons(sk->window=tcp_select_window(sk));
2675 t1->fin = 1;
2676 t1->rst = 0;
2677 t1->doff = sizeof(*t1)/4;
2678 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2679
2680 /*2681 * If there is data in the write queue, the fin must be appended to2682 * the write queue.2683 */2684
2685 if (skb_peek(&sk->write_queue) != NULL)
2686 {2687 buff->free = 0;
2688 if (buff->next != NULL)
2689 {2690 printk("tcp_send_fin: next != NULL\n");
2691 skb_unlink(buff);
2692 }2693 skb_queue_tail(&sk->write_queue, buff);
2694 }2695 else2696 {2697 sk->sent_seq = sk->write_seq;
2698 sk->prot->queue_xmit(sk, dev, buff, 0);
2699 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2700 }2701 }2702
2703 /*2704 * Shutdown the sending side of a connection. Much like close except2705 * that we don't receive shut down or set sk->dead=1.2706 */2707
2708 voidtcp_shutdown(structsock *sk, inthow)
/* */2709 {2710 /*2711 * We need to grab some memory, and put together a FIN,2712 * and then put it into the queue to be sent.2713 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2714 */2715
2716 if (!(how & SEND_SHUTDOWN))
2717 return;
2718
2719 /*2720 * If we've already sent a FIN, or it's a closed state2721 */2722
2723 if (sk->state == TCP_FIN_WAIT1 ||
2724 sk->state == TCP_FIN_WAIT2 ||
2725 sk->state == TCP_CLOSING ||
2726 sk->state == TCP_LAST_ACK ||
2727 sk->state == TCP_TIME_WAIT ||
2728 sk->state == TCP_CLOSE ||
2729 sk->state == TCP_LISTEN2730 )
2731 {2732 return;
2733 }2734 sk->inuse = 1;
2735
2736 /*2737 * flag that the sender has shutdown2738 */2739
2740 sk->shutdown |= SEND_SHUTDOWN;
2741
2742 /*2743 * Clear out any half completed packets. 2744 */2745
2746 if (sk->partial)
2747 tcp_send_partial(sk);
2748
2749 /*2750 * FIN if needed2751 */2752
2753 if(tcp_close_state(sk,0))
2754 tcp_send_fin(sk);
2755
2756 release_sock(sk);
2757 }2758
2759 /*2760 * This routine will send an RST to the other tcp. 2761 */2762
2763 staticvoidtcp_reset(unsignedlongsaddr, unsignedlongdaddr, structtcphdr *th,
/* */2764 structproto *prot, structoptions *opt, structdevice *dev, inttos, intttl)
2765 {2766 structsk_buff *buff;
2767 structtcphdr *t1;
2768 inttmp;
2769 structdevice *ndev=NULL;
2770
2771 /*2772 * Cannot reset a reset (Think about it).2773 */2774
2775 if(th->rst)
2776 return;
2777
2778 /*2779 * We need to grab some memory, and put together an RST,2780 * and then put it into the queue to be sent.2781 */2782
2783 buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2784 if (buff == NULL)
2785 return;
2786
2787 buff->sk = NULL;
2788 buff->dev = dev;
2789 buff->localroute = 0;
2790
2791 /*2792 * Put in the IP header and routing stuff. 2793 */2794
2795 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2796 sizeof(structtcphdr),tos,ttl,NULL);
2797 if (tmp < 0)
2798 {2799 buff->free = 1;
2800 sock_wfree(NULL, buff);
2801 return;
2802 }2803
2804 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
2805 memcpy(t1, th, sizeof(*t1));
2806
2807 /*2808 * Swap the send and the receive. 2809 */2810
2811 t1->dest = th->source;
2812 t1->source = th->dest;
2813 t1->rst = 1;
2814 t1->window = 0;
2815
2816 if(th->ack)
2817 {2818 t1->ack = 0;
2819 t1->seq = th->ack_seq;
2820 t1->ack_seq = 0;
2821 }2822 else2823 {2824 t1->ack = 1;
2825 if(!th->syn)
2826 t1->ack_seq = th->seq;
2827 else2828 t1->ack_seq = htonl(ntohl(th->seq)+1);
2829 t1->seq = 0;
2830 }2831
2832 t1->syn = 0;
2833 t1->urg = 0;
2834 t1->fin = 0;
2835 t1->psh = 0;
2836 t1->doff = sizeof(*t1)/4;
2837 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2838 prot->queue_xmit(NULL, ndev, buff, 1);
2839 tcp_statistics.TcpOutSegs++;
2840 }2841
2842
2843 /*2844 * Look for tcp options. Parses everything but only knows about MSS.2845 * This routine is always called with the packet containing the SYN.2846 * However it may also be called with the ack to the SYN. So you2847 * can't assume this is always the SYN. It's always called after2848 * we have set up sk->mtu to our own MTU.2849 *2850 * We need at minimum to add PAWS support here. Possibly large windows2851 * as Linux gets deployed on 100Mb/sec networks.2852 */2853
2854 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2855 {2856 unsignedchar *ptr;
2857 intlength=(th->doff*4)-sizeof(structtcphdr);
2858 intmss_seen = 0;
2859
2860 ptr = (unsignedchar *)(th + 1);
2861
2862 while(length>0)
2863 {2864 intopcode=*ptr++;
2865 intopsize=*ptr++;
2866 switch(opcode)
2867 {2868 caseTCPOPT_EOL:
2869 return;
2870 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */2871 length--;
2872 ptr--; /* the opsize=*ptr++ above was a mistake */2873 continue;
2874
2875 default:
2876 if(opsize<=2) /* Avoid silly options looping forever */2877 return;
2878 switch(opcode)
2879 {2880 caseTCPOPT_MSS:
2881 if(opsize==4 && th->syn)
2882 {2883 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2884 mss_seen = 1;
2885 }2886 break;
2887 /* Add other options here as people feel the urge to implement stuff like large windows */2888 }2889 ptr+=opsize-2;
2890 length-=opsize;
2891 }2892 }2893 if (th->syn)
2894 {2895 if (! mss_seen)
2896 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2897 }2898 #ifdefCONFIG_INET_PCTCP2899 sk->mss = min(sk->max_window >> 1, sk->mtu);
2900 #else2901 sk->mss = min(sk->max_window, sk->mtu);
2902 #endif2903 }2904
/*
 *	Classful default netmask for an address.
 *
 *	dst is in network byte order and so is the returned mask.
 *	Anything that is neither class A nor class B gets the class C mask.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);
	unsigned long net;

	if (IN_CLASSA(host))
		net = IN_CLASSA_NET;
	else if (IN_CLASSB(host))
		net = IN_CLASSB_NET;
	else
		net = IN_CLASSC_NET;

	return htonl(net);
}
2915 /*2916 * Default sequence number picking algorithm.2917 * As close as possible to RFC 793, which2918 * suggests using a 250kHz clock.2919 * Further reading shows this assumes 2MB/s networks.2920 * For 10MB/s ethernet, a 1MHz clock is appropriate.2921 * That's funny, Linux has one built in! Use it!2922 */2923
2924 externinlineu32tcp_init_seq(void)
/* */2925 {2926 structtimevaltv;
2927 do_gettimeofday(&tv);
2928 returntv.tv_usec+tv.tv_sec*1000000;
2929 }2930
2931 /*2932 * This routine handles a connection request.2933 * It should make sure we haven't already responded.2934 * Because of the way BSD works, we have to send a syn/ack now.2935 * This also means it will be harder to close a socket which is2936 * listening.2937 */2938
2939 staticvoidtcp_conn_request(structsock *sk, structsk_buff *skb,
/* */2940 unsignedlongdaddr, unsignedlongsaddr,
2941 structoptions *opt, structdevice *dev, u32seq)
2942 {2943 structsk_buff *buff;
2944 structtcphdr *t1;
2945 unsignedchar *ptr;
2946 structsock *newsk;
2947 structtcphdr *th;
2948 structdevice *ndev=NULL;
2949 inttmp;
2950 structrtable *rt;
2951
2952 th = skb->h.th;
2953
2954 /* If the socket is dead, don't accept the connection. */2955 if (!sk->dead)
2956 {2957 sk->data_ready(sk,0);
2958 }2959 else2960 {2961 if(sk->debug)
2962 printk("Reset on %p: Connect on dead socket.\n",sk);
2963 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2964 tcp_statistics.TcpAttemptFails++;
2965 kfree_skb(skb, FREE_READ);
2966 return;
2967 }2968
2969 /*2970 * Make sure we can accept more. This will prevent a2971 * flurry of syns from eating up all our memory.2972 */2973
2974 if (sk->ack_backlog >= sk->max_ack_backlog)
2975 {2976 tcp_statistics.TcpAttemptFails++;
2977 kfree_skb(skb, FREE_READ);
2978 return;
2979 }2980
2981 /*2982 * We need to build a new sock struct.2983 * It is sort of bad to have a socket without an inode attached2984 * to it, but the wake_up's will just wake up the listening socket,2985 * and if the listening socket is destroyed before this is taken2986 * off of the queue, this will take care of it.2987 */2988
2989 newsk = (structsock *) kmalloc(sizeof(structsock), GFP_ATOMIC);
2990 if (newsk == NULL)
2991 {2992 /* just ignore the syn. It will get retransmitted. */2993 tcp_statistics.TcpAttemptFails++;
2994 kfree_skb(skb, FREE_READ);
2995 return;
2996 }2997
2998 memcpy(newsk, sk, sizeof(*newsk));
2999 newsk->opt = NULL;
3000 newsk->ip_route_cache = NULL;
3001 if (opt && opt->optlen) {3002 sk->opt = (structoptions*)kmalloc(sizeof(structoptions)+opt->optlen, GFP_ATOMIC);
3003 if (!sk->opt) {3004 kfree_s(newsk, sizeof(structsock));
3005 tcp_statistics.TcpAttemptFails++;
3006 kfree_skb(skb, FREE_READ);
3007 return;
3008 }3009 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {3010 kfree_s(sk->opt, sizeof(structoptions)+opt->optlen);
3011 kfree_s(newsk, sizeof(structsock));
3012 tcp_statistics.TcpAttemptFails++;
3013 kfree_skb(skb, FREE_READ);
3014 return;
3015 }3016 }3017 skb_queue_head_init(&newsk->write_queue);
3018 skb_queue_head_init(&newsk->receive_queue);
3019 newsk->send_head = NULL;
3020 newsk->send_tail = NULL;
3021 skb_queue_head_init(&newsk->back_log);
3022 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/3023 newsk->rto = TCP_TIMEOUT_INIT;
3024 newsk->mdev = 0;
3025 newsk->max_window = 0;
3026 newsk->cong_window = 1;
3027 newsk->cong_count = 0;
3028 newsk->ssthresh = 0;
3029 newsk->backoff = 0;
3030 newsk->blog = 0;
3031 newsk->intr = 0;
3032 newsk->proc = 0;
3033 newsk->done = 0;
3034 newsk->partial = NULL;
3035 newsk->pair = NULL;
3036 newsk->wmem_alloc = 0;
3037 newsk->rmem_alloc = 0;
3038 newsk->localroute = sk->localroute;
3039
3040 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
3041
3042 newsk->err = 0;
3043 newsk->shutdown = 0;
3044 newsk->ack_backlog = 0;
3045 newsk->acked_seq = skb->seq+1;
3046 newsk->copied_seq = skb->seq+1;
3047 newsk->fin_seq = skb->seq;
3048 newsk->state = TCP_SYN_RECV;
3049 newsk->timeout = 0;
3050 newsk->ip_xmit_timeout = 0;
3051 newsk->write_seq = seq;
3052 newsk->window_seq = newsk->write_seq;
3053 newsk->rcv_ack_seq = newsk->write_seq;
3054 newsk->urg_data = 0;
3055 newsk->retransmits = 0;
3056 newsk->linger=0;
3057 newsk->destroy = 0;
3058 init_timer(&newsk->timer);
3059 newsk->timer.data = (unsignedlong)newsk;
3060 newsk->timer.function = &net_timer;
3061 init_timer(&newsk->retransmit_timer);
3062 newsk->retransmit_timer.data = (unsignedlong)newsk;
3063 newsk->retransmit_timer.function=&retransmit_timer;
3064 newsk->dummy_th.source = skb->h.th->dest;
3065 newsk->dummy_th.dest = skb->h.th->source;
3066
3067 /*3068 * Swap these two, they are from our point of view. 3069 */3070
3071 newsk->daddr = saddr;
3072 newsk->saddr = daddr;
3073 newsk->rcv_saddr = daddr;
3074
3075 put_sock(newsk->num,newsk);
3076 newsk->dummy_th.res1 = 0;
3077 newsk->dummy_th.doff = 6;
3078 newsk->dummy_th.fin = 0;
3079 newsk->dummy_th.syn = 0;
3080 newsk->dummy_th.rst = 0;
3081 newsk->dummy_th.psh = 0;
3082 newsk->dummy_th.ack = 0;
3083 newsk->dummy_th.urg = 0;
3084 newsk->dummy_th.res2 = 0;
3085 newsk->acked_seq = skb->seq + 1;
3086 newsk->copied_seq = skb->seq + 1;
3087 newsk->socket = NULL;
3088
3089 /*3090 * Grab the ttl and tos values and use them 3091 */3092
3093 newsk->ip_ttl=sk->ip_ttl;
3094 newsk->ip_tos=skb->ip_hdr->tos;
3095
3096 /*3097 * Use 512 or whatever user asked for 3098 */3099
3100 /*3101 * Note use of sk->user_mss, since user has no direct access to newsk 3102 */3103
3104 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
3105 newsk->ip_route_cache = rt;
3106
3107 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3108 newsk->window_clamp = rt->rt_window;
3109 else3110 newsk->window_clamp = 0;
3111
3112 if (sk->user_mss)
3113 newsk->mtu = sk->user_mss;
3114 elseif (rt)
3115 newsk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
3116 else3117 newsk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
3118
3119 /*3120 * But not bigger than device MTU 3121 */3122
3123 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
3124
3125 #ifdefCONFIG_SKIP3126
3127 /*3128 * SKIP devices set their MTU to 65535. This is so they can take packets3129 * unfragmented to security process then fragment. They could lie to the3130 * TCP layer about a suitable MTU, but its easier to let skip sort it out3131 * simply because the final package we want unfragmented is going to be3132 *3133 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]3134 */3135
3136 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */3137 sk->mtu=skip_pick_mtu(sk->mtu,dev);
3138 #endif3139 /*3140 * This will min with what arrived in the packet 3141 */3142
3143 tcp_options(newsk,skb->h.th);
3144
3145 tcp_cache_zap();
3146
3147 buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3148 if (buff == NULL)
3149 {3150 sk->err = ENOMEM;
3151 newsk->dead = 1;
3152 newsk->state = TCP_CLOSE;
3153 /* And this will destroy it */3154 release_sock(newsk);
3155 kfree_skb(skb, FREE_READ);
3156 tcp_statistics.TcpAttemptFails++;
3157 return;
3158 }3159
3160 buff->sk = newsk;
3161 buff->localroute = newsk->localroute;
3162
3163 /*3164 * Put in the IP header and routing stuff. 3165 */3166
3167 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3168 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&newsk->ip_route_cache);
3169
3170 /*3171 * Something went wrong. 3172 */3173
3174 if (tmp < 0)
3175 {3176 sk->err = tmp;
3177 buff->free = 1;
3178 kfree_skb(buff,FREE_WRITE);
3179 newsk->dead = 1;
3180 newsk->state = TCP_CLOSE;
3181 release_sock(newsk);
3182 skb->sk = sk;
3183 kfree_skb(skb, FREE_READ);
3184 tcp_statistics.TcpAttemptFails++;
3185 return;
3186 }3187
3188 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
3189
3190 memcpy(t1, skb->h.th, sizeof(*t1));
3191 buff->seq = newsk->write_seq++;
3192 buff->end_seq = newsk->write_seq;
3193 /*3194 * Swap the send and the receive. 3195 */3196 t1->dest = skb->h.th->source;
3197 t1->source = newsk->dummy_th.source;
3198 t1->seq = ntohl(buff->seq);
3199 t1->ack = 1;
3200 newsk->window = tcp_select_window(newsk);
3201 newsk->sent_seq = newsk->write_seq;
3202 t1->window = ntohs(newsk->window);
3203 t1->res1 = 0;
3204 t1->res2 = 0;
3205 t1->rst = 0;
3206 t1->urg = 0;
3207 t1->psh = 0;
3208 t1->syn = 1;
3209 t1->ack_seq = htonl(newsk->acked_seq);
3210 t1->doff = sizeof(*t1)/4+1;
3211 ptr = skb_put(buff,4);
3212 ptr[0] = 2;
3213 ptr[1] = 4;
3214 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3215 ptr[3] =(newsk->mtu) & 0xff;
3216
3217 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3218 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3219 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3220 skb->sk = newsk;
3221
3222 /*3223 * Charge the sock_buff to newsk. 3224 */3225
3226 sk->rmem_alloc -= skb->truesize;
3227 newsk->rmem_alloc += skb->truesize;
3228
3229 skb_queue_tail(&sk->receive_queue,skb);
3230 sk->ack_backlog++;
3231 release_sock(newsk);
3232 tcp_statistics.TcpOutSegs++;
3233 }3234
3235
3236 staticvoidtcp_close(structsock *sk, inttimeout)
/* */3237 {3238 /*3239 * We need to grab some memory, and put together a FIN, 3240 * and then put it into the queue to be sent.3241 */3242
3243 sk->inuse = 1;
3244
3245 if(th_cache_sk==sk)
3246 tcp_cache_zap();
3247 if(sk->state == TCP_LISTEN)
3248 {3249 /* Special case */3250 tcp_set_state(sk, TCP_CLOSE);
3251 tcp_close_pending(sk);
3252 release_sock(sk);
3253 return;
3254 }3255
3256 sk->keepopen = 1;
3257 sk->shutdown = SHUTDOWN_MASK;
3258
3259 if (!sk->dead)
3260 sk->state_change(sk);
3261
3262 if (timeout == 0)
3263 {3264 structsk_buff *skb;
3265
3266 /*3267 * We need to flush the recv. buffs. We do this only on the3268 * descriptor close, not protocol-sourced closes, because the3269 * reader process may not have drained the data yet!3270 */3271
3272 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
3273 kfree_skb(skb, FREE_READ);
3274 /*3275 * Get rid off any half-completed packets. 3276 */3277
3278 if (sk->partial)
3279 tcp_send_partial(sk);
3280 }3281
3282
3283 /*3284 * Timeout is not the same thing - however the code likes3285 * to send both the same way (sigh).3286 */3287
3288 if(timeout)
3289 {3290 tcp_set_state(sk, TCP_CLOSE); /* Dead */3291 }3292 else3293 {3294 if(tcp_close_state(sk,1)==1)
3295 {3296 tcp_send_fin(sk);
3297 }3298 }3299 release_sock(sk);
3300 }3301
3302
3303 /*3304 * This routine takes stuff off of the write queue,3305 * and puts it in the xmit queue. This happens as incoming acks3306 * open up the remote window for us.3307 */3308
3309 staticvoidtcp_write_xmit(structsock *sk)
/* */3310 {3311 structsk_buff *skb;
3312
3313 /*3314 * The bytes will have to remain here. In time closedown will3315 * empty the write queue and all will be happy 3316 */3317
3318 if(sk->zapped)
3319 return;
3320
3321 /*3322 * Anything on the transmit queue that fits the window can3323 * be added providing we are not3324 *3325 * a) retransmitting (Nagle's rule)3326 * b) exceeding our congestion window.3327 */3328
3329 while((skb = skb_peek(&sk->write_queue)) != NULL &&
3330 before(skb->end_seq, sk->window_seq + 1) &&
3331 (sk->retransmits == 0 ||
3332 sk->ip_xmit_timeout != TIME_WRITE ||
3333 before(skb->end_seq, sk->rcv_ack_seq + 1))
3334 && sk->packets_out < sk->cong_window)
3335 {3336 IS_SKB(skb);
3337 skb_unlink(skb);
3338
3339 /*3340 * See if we really need to send the packet. 3341 */3342
3343 if (before(skb->end_seq, sk->rcv_ack_seq +1))
3344 {3345 /*3346 * This is acked data. We can discard it. This 3347 * cannot currently occur.3348 */3349
3350 sk->retransmits = 0;
3351 kfree_skb(skb, FREE_WRITE);
3352 if (!sk->dead)
3353 sk->write_space(sk);
3354 }3355 else3356 {3357 structtcphdr *th;
3358 structiphdr *iph;
3359 intsize;
3360 /*3361 * put in the ack seq and window at this point rather than earlier,3362 * in order to keep them monotonic. We really want to avoid taking3363 * back window allocations. That's legal, but RFC1122 says it's frowned on.3364 * Ack and window will in general have changed since this packet was put3365 * on the write queue.3366 */3367 iph = skb->ip_hdr;
3368 th = (structtcphdr *)(((char *)iph) +(iph->ihl << 2));
3369 size = skb->len - (((unsignedchar *) th) - skb->data);
3370 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY3371 if (size > sk->mtu - sizeof(structiphdr))
3372 {3373 iph->frag_off &= ~htons(IP_DF);
3374 ip_send_check(iph);
3375 }3376 #endif3377
3378 th->ack_seq = htonl(sk->acked_seq);
3379 th->window = htons(tcp_select_window(sk));
3380
3381 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
3382
3383 sk->sent_seq = skb->end_seq;
3384
3385 /*3386 * IP manages our queue for some crazy reason3387 */3388
3389 sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
3390
3391 /*3392 * Again we slide the timer wrongly3393 */3394
3395 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3396 }3397 }3398 }3399
3400
3401 /*3402 * This routine deals with incoming acks, but not outgoing ones.3403 */3404
3405 extern__inline__inttcp_ack(structsock *sk, structtcphdr *th, unsignedlongsaddr, intlen)
/* */3406 {3407 u32ack;
3408 intflag = 0;
3409
3410 /* 3411 * 1 - there was data in packet as well as ack or new data is sent or 3412 * in shutdown state3413 * 2 - data from retransmit queue was acked and removed3414 * 4 - window shrunk or data from retransmit queue was acked and removed3415 */3416
3417 if(sk->zapped)
3418 return(1); /* Dead, cant ack any more so why bother */3419
3420 /*3421 * Have we discovered a larger window3422 */3423
3424 ack = ntohl(th->ack_seq);
3425
3426 if (ntohs(th->window) > sk->max_window)
3427 {3428 sk->max_window = ntohs(th->window);
3429 #ifdefCONFIG_INET_PCTCP3430 /* Hack because we don't send partial packets to non SWS3431 handling hosts */3432 sk->mss = min(sk->max_window>>1, sk->mtu);
3433 #else3434 sk->mss = min(sk->max_window, sk->mtu);
3435 #endif3436 }3437
3438 /*3439 * We have dropped back to keepalive timeouts. Thus we have3440 * no retransmits pending.3441 */3442
3443 if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3444 sk->retransmits = 0;
3445
3446 /*3447 * If the ack is newer than sent or older than previous acks3448 * then we can probably ignore it.3449 */3450
3451 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
3452 {3453 if(sk->debug)
3454 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3455
3456 /*3457 * Keepalive processing.3458 */3459
3460 if (after(ack, sk->sent_seq))
3461 {3462 return(0);
3463 }3464
3465 /*3466 * Restart the keepalive timer.3467 */3468
3469 if (sk->keepopen)
3470 {3471 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3472 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3473 }3474 return(1);
3475 }3476
3477 /*3478 * If there is data set flag 13479 */3480
3481 if (len != th->doff*4)
3482 flag |= 1;
3483
3484 /*3485 * See if our window has been shrunk. 3486 */3487
3488 if (after(sk->window_seq, ack+ntohs(th->window)))
3489 {3490 /*3491 * We may need to move packets from the send queue3492 * to the write queue, if the window has been shrunk on us.3493 * The RFC says you are not allowed to shrink your window3494 * like this, but if the other end does, you must be able3495 * to deal with it.3496 */3497 structsk_buff *skb;
3498 structsk_buff *skb2;
3499 structsk_buff *wskb = NULL;
3500
3501 skb2 = sk->send_head;
3502 sk->send_head = NULL;
3503 sk->send_tail = NULL;
3504
3505 /*3506 * This is an artifact of a flawed concept. We want one3507 * queue and a smarter send routine when we send all.3508 */3509
3510 flag |= 4; /* Window changed */3511
3512 sk->window_seq = ack + ntohs(th->window);
3513 cli();
3514 while (skb2 != NULL)
3515 {3516 skb = skb2;
3517 skb2 = skb->link3;
3518 skb->link3 = NULL;
3519 if (after(skb->end_seq, sk->window_seq))
3520 {3521 if (sk->packets_out > 0)
3522 sk->packets_out--;
3523 /* We may need to remove this from the dev send list. */3524 if (skb->next != NULL)
3525 {3526 skb_unlink(skb);
3527 }3528 /* Now add it to the write_queue. */3529 if (wskb == NULL)
3530 skb_queue_head(&sk->write_queue,skb);
3531 else3532 skb_append(wskb,skb);
3533 wskb = skb;
3534 }3535 else3536 {3537 if (sk->send_head == NULL)
3538 {3539 sk->send_head = skb;
3540 sk->send_tail = skb;
3541 }3542 else3543 {3544 sk->send_tail->link3 = skb;
3545 sk->send_tail = skb;
3546 }3547 skb->link3 = NULL;
3548 }3549 }3550 sti();
3551 }3552
3553 /*3554 * Pipe has emptied3555 */3556
3557 if (sk->send_tail == NULL || sk->send_head == NULL)
3558 {3559 sk->send_head = NULL;
3560 sk->send_tail = NULL;
3561 sk->packets_out= 0;
3562 }3563
3564 /*3565 * Update the right hand window edge of the host3566 */3567
3568 sk->window_seq = ack + ntohs(th->window);
3569
3570 /*3571 * We don't want too many packets out there. 3572 */3573
3574 if (sk->ip_xmit_timeout == TIME_WRITE &&
3575 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
3576 {3577 /* 3578 * This is Jacobson's slow start and congestion avoidance. 3579 * SIGCOMM '88, p. 328. Because we keep cong_window in integral3580 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a 3581 * counter and increment it once every cwnd times. It's possible3582 * that this should be done only if sk->retransmits == 0. I'm3583 * interpreting "new data is acked" as including data that has3584 * been retransmitted but is just now being acked.3585 */3586 if (sk->cong_window < sk->ssthresh)
3587 /* 3588 * In "safe" area, increase3589 */3590 sk->cong_window++;
3591 else3592 {3593 /*3594 * In dangerous area, increase slowly. In theory this is3595 * sk->cong_window += 1 / sk->cong_window3596 */3597 if (sk->cong_count >= sk->cong_window)
3598 {3599 sk->cong_window++;
3600 sk->cong_count = 0;
3601 }3602 else3603 sk->cong_count++;
3604 }3605 }3606
3607 /*3608 * Remember the highest ack received.3609 */3610
3611 sk->rcv_ack_seq = ack;
3612
3613 /*3614 * We passed data and got it acked, remove any soft error3615 * log. Something worked...3616 */3617
3618 sk->err_soft = 0;
3619
3620 /*3621 * If this ack opens up a zero window, clear backoff. It was3622 * being used to time the probes, and is probably far higher than3623 * it needs to be for normal retransmission.3624 */3625
3626 if (sk->ip_xmit_timeout == TIME_PROBE0)
3627 {3628 sk->retransmits = 0; /* Our probe was answered */3629
3630 /*3631 * Was it a usable window open ?3632 */3633
3634 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */3635 ! before (sk->window_seq, sk->write_queue.next->end_seq))
3636 {3637 sk->backoff = 0;
3638
3639 /*3640 * Recompute rto from rtt. this eliminates any backoff.3641 */3642
3643 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3644 if (sk->rto > 120*HZ)
3645 sk->rto = 120*HZ;
3646 if (sk->rto < HZ/5) /* Was 1*HZ, then 1 - turns out we must allow about3647 .2 of a second because of BSD delayed acks - on a 100Mb/sec link3648 .2 of a second is going to need huge windows (SIGH) */3649 sk->rto = HZ/5;
3650 }3651 }3652
3653 /* 3654 * See if we can take anything off of the retransmit queue.3655 */3656
3657 while(sk->send_head != NULL)
3658 {3659 /* Check for a bug. */3660 if (sk->send_head->link3 &&
3661 after(sk->send_head->end_seq, sk->send_head->link3->end_seq))
3662 printk("INET: tcp.c: *** bug send_list out of order.\n");
3663
3664 /*3665 * If our packet is before the ack sequence we can3666 * discard it as it's confirmed to have arrived the other end.3667 */3668
3669 if (before(sk->send_head->end_seq, ack+1))
3670 {3671 structsk_buff *oskb;
3672 if (sk->retransmits)
3673 {3674 /*3675 * We were retransmitting. don't count this in RTT est 3676 */3677 flag |= 2;
3678
3679 /*3680 * even though we've gotten an ack, we're still3681 * retransmitting as long as we're sending from3682 * the retransmit queue. Keeping retransmits non-zero3683 * prevents us from getting new data interspersed with3684 * retransmissions.3685 */3686
3687 if (sk->send_head->link3) /* Any more queued retransmits? */3688 sk->retransmits = 1;
3689 else3690 sk->retransmits = 0;
3691 }3692 /*3693 * Note that we only reset backoff and rto in the3694 * rtt recomputation code. And that doesn't happen3695 * if there were retransmissions in effect. So the3696 * first new packet after the retransmissions is3697 * sent with the backoff still in effect. Not until3698 * we get an ack from a non-retransmitted packet do3699 * we reset the backoff and rto. This allows us to deal3700 * with a situation where the network delay has increased3701 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)3702 */3703
3704 /*3705 * We have one less packet out there. 3706 */3707
3708 if (sk->packets_out > 0)
3709 sk->packets_out --;
3710 /* 3711 * Wake up the process, it can probably write more. 3712 */3713 if (!sk->dead)
3714 sk->write_space(sk);
3715 oskb = sk->send_head;
3716
3717 if (!(flag&2)) /* Not retransmitting */3718 {3719 longm;
3720
3721 /*3722 * The following amusing code comes from Jacobson's3723 * article in SIGCOMM '88. Note that rtt and mdev3724 * are scaled versions of rtt and mean deviation.3725 * This is designed to be as fast as possible 3726 * m stands for "measurement".3727 */3728
3729 m = jiffies - oskb->when; /* RTT */3730 if(m<=0)
3731 m=1; /* IS THIS RIGHT FOR <0 ??? */3732 m -= (sk->rtt >> 3); /* m is now error in rtt est */3733 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */3734 if (m < 0)
3735 m = -m; /* m is now abs(error) */3736 m -= (sk->mdev >> 2); /* similar update on mdev */3737 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */3738
3739 /*3740 * Now update timeout. Note that this removes any backoff.3741 */3742
3743 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3744 if (sk->rto > 120*HZ)
3745 sk->rto = 120*HZ;
3746 if (sk->rto < HZ/5) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */3747 sk->rto = HZ/5;
3748 sk->backoff = 0;
3749 }3750 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt 3751 In this case as we just set it up */3752 cli();
3753 oskb = sk->send_head;
3754 IS_SKB(oskb);
3755 sk->send_head = oskb->link3;
3756 if (sk->send_head == NULL)
3757 {3758 sk->send_tail = NULL;
3759 }3760
3761 /*3762 * We may need to remove this from the dev send list. 3763 */3764
3765 if (oskb->next)
3766 skb_unlink(oskb);
3767 sti();
3768 kfree_skb(oskb, FREE_WRITE); /* write. */3769 if (!sk->dead)
3770 sk->write_space(sk);
3771 }3772 else3773 {3774 break;
3775 }3776 }3777
3778 /*3779 * XXX someone ought to look at this too.. at the moment, if skb_peek()3780 * returns non-NULL, we complete ignore the timer stuff in the else3781 * clause. We ought to organize the code so that else clause can3782 * (should) be executed regardless, possibly moving the PROBE timer3783 * reset over. The skb_peek() thing should only move stuff to the3784 * write queue, NOT also manage the timer functions.3785 */3786
3787 /*3788 * Maybe we can take some stuff off of the write queue,3789 * and put it onto the xmit queue.3790 */3791 if (skb_peek(&sk->write_queue) != NULL)
3792 {3793 if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
3794 (sk->retransmits == 0 ||
3795 sk->ip_xmit_timeout != TIME_WRITE ||
3796 before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
3797 && sk->packets_out < sk->cong_window)
3798 {3799 /*3800 * Add more data to the send queue.3801 */3802 flag |= 1;
3803 tcp_write_xmit(sk);
3804 }3805 elseif (before(sk->window_seq, sk->write_queue.next->end_seq) &&
3806 sk->send_head == NULL &&
3807 sk->ack_backlog == 0 &&
3808 sk->state != TCP_TIME_WAIT)
3809 {3810 /*3811 * Data to queue but no room.3812 */3813 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3814 }3815 }3816 else3817 {3818 /*3819 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets3820 * from TCP_CLOSE we don't do anything3821 *3822 * from anything else, if there is write data (or fin) pending,3823 * we use a TIME_WRITE timeout, else if keepalive we reset to3824 * a KEEPALIVE timeout, else we delete the timer.3825 *3826 * We do not set flag for nominal write data, otherwise we may3827 * force a state where we start to write itsy bitsy tidbits3828 * of data.3829 */3830
3831 switch(sk->state) {3832 caseTCP_TIME_WAIT:
3833 /*3834 * keep us in TIME_WAIT until we stop getting packets,3835 * reset the timeout.3836 */3837 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3838 break;
3839 caseTCP_CLOSE:
3840 /*3841 * don't touch the timer.3842 */3843 break;
3844 default:
3845 /*3846 * Must check send_head, write_queue, and ack_backlog3847 * to determine which timeout to use.3848 */3849 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {3850 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3851 }elseif (sk->keepopen) {3852 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3853 }else{3854 del_timer(&sk->retransmit_timer);
3855 sk->ip_xmit_timeout = 0;
3856 }3857 break;
3858 }3859 }3860
3861 /*3862 * We have nothing queued but space to send. Send any partial3863 * packets immediately (end of Nagle rule application).3864 */3865
3866 if (sk->packets_out == 0 && sk->partial != NULL &&
3867 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
3868 {3869 flag |= 1;
3870 tcp_send_partial(sk);
3871 }3872
3873 /*3874 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and3875 * we are now waiting for an acknowledge to our FIN. The other end is3876 * already in TIME_WAIT.3877 *3878 * Move to TCP_CLOSE on success.3879 */3880
3881 if (sk->state == TCP_LAST_ACK)
3882 {3883 if (!sk->dead)
3884 sk->state_change(sk);
3885 if(sk->debug)
3886 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3887 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3888 if (sk->rcv_ack_seq == sk->write_seq/*&& sk->acked_seq == sk->fin_seq*/)
3889 {3890 flag |= 1;
3891 sk->shutdown = SHUTDOWN_MASK;
3892 tcp_set_state(sk,TCP_CLOSE);
3893 return 1;
3894 }3895 }3896
3897 /*3898 * Incoming ACK to a FIN we sent in the case of our initiating the close.3899 *3900 * Move to FIN_WAIT2 to await a FIN from the other end. Set3901 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.3902 */3903
3904 if (sk->state == TCP_FIN_WAIT1)
3905 {3906
3907 if (!sk->dead)
3908 sk->state_change(sk);
3909 if (sk->rcv_ack_seq == sk->write_seq)
3910 {3911 flag |= 1;
3912 sk->shutdown |= SEND_SHUTDOWN;
3913 tcp_set_state(sk, TCP_FIN_WAIT2);
3914 }3915 }3916
3917 /*3918 * Incoming ACK to a FIN we sent in the case of a simultaneous close.3919 *3920 * Move to TIME_WAIT3921 */3922
3923 if (sk->state == TCP_CLOSING)
3924 {3925
3926 if (!sk->dead)
3927 sk->state_change(sk);
3928 if (sk->rcv_ack_seq == sk->write_seq)
3929 {3930 flag |= 1;
3931 tcp_time_wait(sk);
3932 }3933 }3934
3935 /*3936 * Final ack of a three way shake 3937 */3938
3939 if(sk->state==TCP_SYN_RECV)
3940 {3941 tcp_set_state(sk, TCP_ESTABLISHED);
3942 tcp_options(sk,th);
3943 sk->dummy_th.dest=th->source;
3944 sk->copied_seq = sk->acked_seq;
3945 if(!sk->dead)
3946 sk->state_change(sk);
3947 if(sk->max_window==0)
3948 {3949 sk->max_window=32; /* Sanity check */3950 sk->mss=min(sk->max_window,sk->mtu);
3951 }3952 }3953
3954 /*3955 * I make no guarantees about the first clause in the following3956 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under3957 * what conditions "!flag" would be true. However I think the rest3958 * of the conditions would prevent that from causing any3959 * unnecessary retransmission. 3960 * Clearly if the first packet has expired it should be 3961 * retransmitted. The other alternative, "flag&2 && retransmits", is3962 * harder to explain: You have to look carefully at how and when the3963 * timer is set and with what timeout. The most recent transmission always3964 * sets the timer. So in general if the most recent thing has timed3965 * out, everything before it has as well. So we want to go ahead and3966 * retransmit some more. If we didn't explicitly test for this3967 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"3968 * would not be true. If you look at the pattern of timing, you can3969 * show that rto is increased fast enough that the next packet would3970 * almost never be retransmitted immediately. Then you'd end up3971 * waiting for a timeout to send each packet on the retransmission3972 * queue. With my implementation of the Karn sampling algorithm,3973 * the timeout would double each time. The net result is that it would3974 * take a hideous amount of time to recover from a single dropped packet.3975 * It's possible that there should also be a test for TIME_WRITE, but3976 * I think as long as "send_head != NULL" and "retransmit" is on, we've3977 * got to be in real retransmission mode.3978 * Note that tcp_do_retransmit is called with all==1. Setting cong_window3979 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.3980 * As long as no further losses occur, this seems reasonable.3981 */3982
3983 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3984 (((flag&2) && sk->retransmits) ||
3985 (sk->send_head->when + sk->rto < jiffies)))
3986 {3987 if(sk->send_head->when + sk->rto < jiffies)
3988 tcp_retransmit(sk,0);
3989 else3990 {3991 tcp_do_retransmit(sk, 1);
3992 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3993 }3994 }3995
3996 return(1);
3997 }3998
3999
4000 /*4001 * Process the FIN bit. This now behaves as it is supposed to work4002 * and the FIN takes effect when it is validly part of sequence4003 * space. Not before when we get holes.4004 *4005 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT4006 * (and thence onto LAST-ACK and finally, CLOSE, we never enter4007 * TIME-WAIT)4008 *4009 * If we are in FINWAIT-1, a received FIN indicates simultaneous4010 * close and we go into CLOSING (and later onto TIME-WAIT)4011 *4012 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.4013 *4014 */4015
4016 staticinttcp_fin(structsk_buff *skb, structsock *sk, structtcphdr *th)
/* */4017 {4018 sk->fin_seq = skb->end_seq;
4019
4020 if (!sk->dead)
4021 {4022 sk->state_change(sk);
4023 sock_wake_async(sk->socket, 1);
4024 }4025
4026 switch(sk->state)
4027 {4028 caseTCP_SYN_RECV:
4029 caseTCP_SYN_SENT:
4030 caseTCP_ESTABLISHED:
4031 /*4032 * move to CLOSE_WAIT, tcp_data() already handled4033 * sending the ack.4034 */4035 tcp_set_state(sk,TCP_CLOSE_WAIT);
4036 if (th->rst)
4037 sk->shutdown = SHUTDOWN_MASK;
4038 break;
4039
4040 caseTCP_CLOSE_WAIT:
4041 caseTCP_CLOSING:
4042 /*4043 * received a retransmission of the FIN, do4044 * nothing.4045 */4046 break;
4047 caseTCP_TIME_WAIT:
4048 /*4049 * received a retransmission of the FIN,4050 * restart the TIME_WAIT timer.4051 */4052 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4053 return(0);
4054 caseTCP_FIN_WAIT1:
4055 /*4056 * This case occurs when a simultaneous close4057 * happens, we must ack the received FIN and4058 * enter the CLOSING state.4059 *4060 * This causes a WRITE timeout, which will either4061 * move on to TIME_WAIT when we timeout, or resend4062 * the FIN properly (maybe we get rid of that annoying4063 * FIN lost hang). The TIME_WRITE code is already correct4064 * for handling this timeout.4065 */4066
4067 if(sk->ip_xmit_timeout != TIME_WRITE)
4068 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4069 tcp_set_state(sk,TCP_CLOSING);
4070 break;
4071 caseTCP_FIN_WAIT2:
4072 /*4073 * received a FIN -- send ACK and enter TIME_WAIT4074 */4075 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4076 sk->shutdown|=SHUTDOWN_MASK;
4077 tcp_set_state(sk,TCP_TIME_WAIT);
4078 break;
4079 caseTCP_CLOSE:
4080 /*4081 * already in CLOSE4082 */4083 break;
4084 default:
4085 tcp_set_state(sk,TCP_LAST_ACK);
4086
4087 /* Start the timers. */4088 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4089 return(0);
4090 }4091
4092 return(0);
4093 }4094
4095
4096
4097 /*4098 * This routine handles the data. If there is room in the buffer,4099 * it will be have already been moved into it. If there is no4100 * room, then we will just have to discard the packet.4101 */4102
4103 extern__inline__inttcp_data(structsk_buff *skb, structsock *sk,
/* */4104 unsignedlongsaddr, unsignedshortlen)
4105 {4106 structsk_buff *skb1, *skb2;
4107 structtcphdr *th;
4108 intdup_dumped=0;
4109 u32new_seq, shut_seq;
4110
4111 th = skb->h.th;
4112 skb_pull(skb,th->doff*4);
4113 skb_trim(skb,len-(th->doff*4));
4114
4115 /*4116 * The bytes in the receive read/assembly queue has increased. Needed for the4117 * low memory discard algorithm 4118 */4119
4120 sk->bytes_rcv += skb->len;
4121
4122 if (skb->len == 0 && !th->fin)
4123 {4124 /* 4125 * Don't want to keep passing ack's back and forth. 4126 * (someone sent us dataless, boring frame)4127 */4128 if (!th->ack)
4129 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4130 kfree_skb(skb, FREE_READ);
4131 return(0);
4132 }4133
4134 /*4135 * We no longer have anyone receiving data on this connection.4136 */4137
4138 #ifndef TCP_DONT_RST_SHUTDOWN
4139
4140 if(sk->shutdown & RCV_SHUTDOWN)
4141 {4142 /*4143 * FIXME: BSD has some magic to avoid sending resets to4144 * broken 4.2 BSD keepalives. Much to my surprise a few non4145 * BSD stacks still have broken keepalives so we want to4146 * cope with it.4147 */4148
4149 if(skb->len) /* We don't care if it's just an ack or4150 a keepalive/window probe */4151 {4152 new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */4153
4154 /* Do this the way 4.4BSD treats it. Not what I'd4155 regard as the meaning of the spec but it's what BSD4156 does and clearly they know everything 8) */4157
4158 /*4159 * This is valid because of two things4160 *4161 * a) The way tcp_data behaves at the bottom.4162 * b) A fin takes effect when read not when received.4163 */4164
4165 shut_seq = sk->acked_seq+1; /* Last byte */4166
4167 if(after(new_seq,shut_seq))
4168 {4169 if(sk->debug)
4170 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4171 sk, new_seq, shut_seq, sk->blog);
4172 if(sk->dead)
4173 {4174 sk->acked_seq = new_seq + th->fin;
4175 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4176 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4177 tcp_statistics.TcpEstabResets++;
4178 sk->err = EPIPE;
4179 sk->error_report(sk);
4180 sk->shutdown = SHUTDOWN_MASK;
4181 tcp_set_state(sk,TCP_CLOSE);
4182 kfree_skb(skb, FREE_READ);
4183 return 0;
4184 }4185 }4186 }4187 }4188
4189 #endif4190
4191 /*4192 * Now we have to walk the chain, and figure out where this one4193 * goes into it. This is set up so that the last packet we received4194 * will be the first one we look at, that way if everything comes4195 * in order, there will be no performance loss, and if they come4196 * out of order we will be able to fit things in nicely.4197 *4198 * [AC: This is wrong. We should assume in order first and then walk4199 * forwards from the first hole based upon real traffic patterns.]4200 * 4201 */4202
4203 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */4204 {4205 skb_queue_head(&sk->receive_queue,skb);
4206 skb1= NULL;
4207 }4208 else4209 {4210 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
4211 {4212 if(sk->debug)
4213 {4214 printk("skb1=%p :", skb1);
4215 printk("skb1->seq = %d: ", skb1->seq);
4216 printk("skb->seq = %d\n",skb->seq);
4217 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4218 sk->acked_seq);
4219 }4220
4221 /*4222 * Optimisation: Duplicate frame or extension of previous frame from4223 * same sequence point (lost ack case).4224 * The frame contains duplicate data or replaces a previous frame4225 * discard the previous frame (safe as sk->inuse is set) and put4226 * the new one in its place.4227 */4228
4229 if (skb->seq==skb1->seq && skb->len>=skb1->len)
4230 {4231 skb_append(skb1,skb);
4232 skb_unlink(skb1);
4233 kfree_skb(skb1,FREE_READ);
4234 dup_dumped=1;
4235 skb1=NULL;
4236 break;
4237 }4238
4239 /*4240 * Found where it fits4241 */4242
4243 if (after(skb->seq+1, skb1->seq))
4244 {4245 skb_append(skb1,skb);
4246 break;
4247 }4248
4249 /*4250 * See if we've hit the start. If so insert.4251 */4252 if (skb1 == skb_peek(&sk->receive_queue))
4253 {4254 skb_queue_head(&sk->receive_queue, skb);
4255 break;
4256 }4257 }4258 }4259
4260 /*4261 * Figure out what the ack value for this frame is4262 */4263
4264 if (before(sk->acked_seq, sk->copied_seq))
4265 {4266 printk("*** tcp.c:tcp_data bug acked < copied\n");
4267 sk->acked_seq = sk->copied_seq;
4268 }4269
4270 /*4271 * Now figure out if we can ack anything. This is very messy because we really want two4272 * receive queues, a completed and an assembly queue. We also want only one transmit4273 * queue.4274 */4275
4276 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1))
4277 {4278 if (before(skb->seq, sk->acked_seq+1))
4279 {4280 intnewwindow;
4281
4282 if (after(skb->end_seq, sk->acked_seq))
4283 {4284 newwindow = sk->window - (skb->end_seq - sk->acked_seq);
4285 if (newwindow < 0)
4286 newwindow = 0;
4287 sk->window = newwindow;
4288 sk->acked_seq = skb->end_seq;
4289 }4290 skb->acked = 1;
4291
4292 /*4293 * When we ack the fin, we do the FIN 4294 * processing.4295 */4296
4297 if (skb->h.th->fin)
4298 {4299 tcp_fin(skb,sk,skb->h.th);
4300 }4301
4302 for(skb2 = skb->next;
4303 skb2 != (structsk_buff *)&sk->receive_queue;
4304 skb2 = skb2->next)
4305 {4306 if (before(skb2->seq, sk->acked_seq+1))
4307 {4308 if (after(skb2->end_seq, sk->acked_seq))
4309 {4310 newwindow = sk->window -
4311 (skb2->end_seq - sk->acked_seq);
4312 if (newwindow < 0)
4313 newwindow = 0;
4314 sk->window = newwindow;
4315 sk->acked_seq = skb2->end_seq;
4316 }4317 skb2->acked = 1;
4318 /*4319 * When we ack the fin, we do4320 * the fin handling.4321 */4322 if (skb2->h.th->fin)
4323 {4324 tcp_fin(skb,sk,skb->h.th);
4325 }4326
4327 /*4328 * Force an immediate ack.4329 */4330
4331 sk->ack_backlog = sk->max_ack_backlog;
4332 }4333 else4334 {4335 break;
4336 }4337 }4338
4339 /*4340 * This also takes care of updating the window.4341 * This if statement needs to be simplified.4342 */4343 if (!sk->delay_acks ||
4344 sk->ack_backlog >= sk->max_ack_backlog ||
4345 sk->bytes_rcv > sk->max_unacked || th->fin) {4346 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4347 }4348 else4349 {4350 sk->ack_backlog++;
4351 if(sk->debug)
4352 printk("Ack queued.\n");
4353 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4354 }4355 }4356 }4357
4358 /*4359 * If we've missed a packet, send an ack.4360 * Also start a timer to send another.4361 */4362
4363 if (!skb->acked)
4364 {4365
4366 /*4367 * This is important. If we don't have much room left,4368 * we need to throw out a few packets so we have a good4369 * window. Note that mtu is used, not mss, because mss is really4370 * for the send side. He could be sending us stuff as large as mtu.4371 */4372
4373 while (sock_rspace(sk) < sk->mtu)
4374 {4375 skb1 = skb_peek(&sk->receive_queue);
4376 if (skb1 == NULL)
4377 {4378 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4379 break;
4380 }4381
4382 /*4383 * Don't throw out something that has been acked. 4384 */4385
4386 if (skb1->acked)
4387 {4388 break;
4389 }4390
4391 skb_unlink(skb1);
4392 kfree_skb(skb1, FREE_READ);
4393 }4394 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4395 sk->ack_backlog++;
4396 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4397 }4398 else4399 {4400 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4401 }4402
4403 /*4404 * Now tell the user we may have some data. 4405 */4406
4407 if (!sk->dead)
4408 {4409 if(sk->debug)
4410 printk("Data wakeup.\n");
4411 sk->data_ready(sk,0);
4412 }4413 return(0);
4414 }4415
4416
4417 /*4418 * This routine is only called when we have urgent data4419 * signalled. Its the 'slow' part of tcp_urg. It could be4420 * moved inline now as tcp_urg is only called from one4421 * place. We handle URGent data wrong. We have to - as4422 * BSD still doesn't use the correction from RFC961.4423 */4424
4425 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4426 {4427 u32ptr = ntohs(th->urg_ptr);
4428
4429 if (ptr)
4430 ptr--;
4431 ptr += ntohl(th->seq);
4432
4433 /* ignore urgent data that we've already seen and read */4434 if (after(sk->copied_seq, ptr))
4435 return;
4436
4437 /* do we already have a newer (or duplicate) urgent pointer? */4438 if (sk->urg_data && !after(ptr, sk->urg_seq))
4439 return;
4440
4441 /* tell the world about our new urgent pointer */4442 if (sk->proc != 0) {4443 if (sk->proc > 0) {4444 kill_proc(sk->proc, SIGURG, 1);
4445 }else{4446 kill_pg(-sk->proc, SIGURG, 1);
4447 }4448 }4449 sk->urg_data = URG_NOTYET;
4450 sk->urg_seq = ptr;
4451 }4452
4453 /*4454 * This is the 'fast' part of urgent handling.4455 */4456
4457 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4458 unsignedlongsaddr, unsignedlonglen)
4459 {4460 u32ptr;
4461
4462 /*4463 * Check if we get a new urgent pointer - normally not 4464 */4465
4466 if (th->urg)
4467 tcp_check_urg(sk,th);
4468
4469 /*4470 * Do we wait for any urgent data? - normally not4471 */4472
4473 if (sk->urg_data != URG_NOTYET)
4474 return 0;
4475
4476 /*4477 * Is the urgent pointer pointing into this packet? 4478 */4479
4480 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
4481 if (ptr >= len)
4482 return 0;
4483
4484 /*4485 * Ok, got the correct packet, update info 4486 */4487
4488 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4489 if (!sk->dead)
4490 sk->data_ready(sk,0);
4491 return 0;
4492 }4493
4494 /*4495 * This will accept the next outstanding connection. 4496 */4497
4498 staticstructsock *tcp_accept(structsock *sk, intflags)
/* */4499 {4500 structsock *newsk;
4501 structsk_buff *skb;
4502
4503 /*4504 * We need to make sure that this socket is listening,4505 * and that it has something pending.4506 */4507
4508 if (sk->state != TCP_LISTEN)
4509 {4510 sk->err = EINVAL;
4511 return(NULL);
4512 }4513
4514 /* Avoid the race. */4515 cli();
4516 sk->inuse = 1;
4517
4518 while((skb = tcp_dequeue_established(sk)) == NULL)
4519 {4520 if (flags & O_NONBLOCK)
4521 {4522 sti();
4523 release_sock(sk);
4524 sk->err = EAGAIN;
4525 return(NULL);
4526 }4527
4528 release_sock(sk);
4529 interruptible_sleep_on(sk->sleep);
4530 if (current->signal & ~current->blocked)
4531 {4532 sti();
4533 sk->err = ERESTARTSYS;
4534 return(NULL);
4535 }4536 sk->inuse = 1;
4537 }4538 sti();
4539
4540 /*4541 * Now all we need to do is return skb->sk. 4542 */4543
4544 newsk = skb->sk;
4545
4546 kfree_skb(skb, FREE_READ);
4547 sk->ack_backlog--;
4548 release_sock(sk);
4549 return(newsk);
4550 }4551
4552
4553 /*4554 * This will initiate an outgoing connection. 4555 */4556
4557 staticinttcp_connect(structsock *sk, structsockaddr_in *usin, intaddr_len)
/* */4558 {4559 structsk_buff *buff;
4560 structdevice *dev=NULL;
4561 unsignedchar *ptr;
4562 inttmp;
4563 intatype;
4564 structtcphdr *t1;
4565 structrtable *rt;
4566
4567 if (sk->state != TCP_CLOSE)
4568 return(-EISCONN);
4569
4570 /*4571 * Don't allow a double connect.4572 */4573
4574 if(sk->daddr)
4575 return -EINVAL;
4576
4577 if (addr_len < 8)
4578 return(-EINVAL);
4579
4580 if (usin->sin_family && usin->sin_family != AF_INET)
4581 return(-EAFNOSUPPORT);
4582
4583 /*4584 * connect() to INADDR_ANY means loopback (BSD'ism).4585 */4586
4587 if(usin->sin_addr.s_addr==INADDR_ANY)
4588 usin->sin_addr.s_addr=ip_my_addr();
4589
4590 /*4591 * Don't want a TCP connection going to a broadcast address 4592 */4593
4594 if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
4595 return -ENETUNREACH;
4596
4597 sk->inuse = 1;
4598 sk->daddr = usin->sin_addr.s_addr;
4599 sk->write_seq = tcp_init_seq();
4600 sk->window_seq = sk->write_seq;
4601 sk->rcv_ack_seq = sk->write_seq -1;
4602 sk->err = 0;
4603 sk->dummy_th.dest = usin->sin_port;
4604 release_sock(sk);
4605
4606 buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4607 if (buff == NULL)
4608 {4609 return(-ENOMEM);
4610 }4611 sk->inuse = 1;
4612 buff->sk = sk;
4613 buff->free = 0;
4614 buff->localroute = sk->localroute;
4615
4616
4617 /*4618 * Put in the IP header and routing stuff.4619 */4620
4621 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4622 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
4623 if (tmp < 0)
4624 {4625 sock_wfree(sk, buff);
4626 release_sock(sk);
4627 return(-ENETUNREACH);
4628 }4629 if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
4630 sk->saddr = rt->rt_src;
4631 sk->rcv_saddr = sk->saddr;
4632
4633 t1 = (structtcphdr *) skb_put(buff,sizeof(structtcphdr));
4634
4635 memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4636 buff->seq = sk->write_seq++;
4637 t1->seq = htonl(buff->seq);
4638 sk->sent_seq = sk->write_seq;
4639 buff->end_seq = sk->write_seq;
4640 t1->ack = 0;
4641 t1->window = 2;
4642 t1->res1=0;
4643 t1->res2=0;
4644 t1->rst = 0;
4645 t1->urg = 0;
4646 t1->psh = 0;
4647 t1->syn = 1;
4648 t1->urg_ptr = 0;
4649 t1->doff = 6;
4650 /* use 512 or whatever user asked for */4651
4652 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4653 sk->window_clamp=rt->rt_window;
4654 else4655 sk->window_clamp=0;
4656
4657 if (sk->user_mss)
4658 sk->mtu = sk->user_mss;
4659 elseif (rt)
4660 sk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
4661 else4662 sk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
4663
4664 /*4665 * but not bigger than device MTU 4666 */4667
4668 if(sk->mtu <32)
4669 sk->mtu = 32; /* Sanity limit */4670
4671 sk->mtu = min(sk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
4672
4673 #ifdefCONFIG_SKIP4674
4675 /*4676 * SKIP devices set their MTU to 65535. This is so they can take packets4677 * unfragmented to security process then fragment. They could lie to the4678 * TCP layer about a suitable MTU, but its easier to let skip sort it out4679 * simply because the final package we want unfragmented is going to be4680 *4681 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]4682 */4683
4684 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */4685 sk->mtu=skip_pick_mtu(sk->mtu,dev);
4686 #endif4687
4688 /*4689 * Put in the TCP options to say MTU. 4690 */4691
4692 ptr = skb_put(buff,4);
4693 ptr[0] = 2;
4694 ptr[1] = 4;
4695 ptr[2] = (sk->mtu) >> 8;
4696 ptr[3] = (sk->mtu) & 0xff;
4697 tcp_send_check(t1, sk->saddr, sk->daddr,
4698 sizeof(structtcphdr) + 4, sk);
4699
4700 /*4701 * This must go first otherwise a really quick response will get reset. 4702 */4703
4704 tcp_cache_zap();
4705 tcp_set_state(sk,TCP_SYN_SENT);
4706 if(rt&&rt->rt_flags&RTF_IRTT)
4707 sk->rto = rt->rt_irtt;
4708 else4709 sk->rto = TCP_TIMEOUT_INIT;
4710 sk->retransmit_timer.function=&retransmit_timer;
4711 sk->retransmit_timer.data = (unsignedlong)sk;
4712 reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */4713 sk->retransmits = 0; /* Now works the right way instead of a hacked 4714 initial setting */4715
4716 sk->prot->queue_xmit(sk, dev, buff, 0);
4717 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4718 tcp_statistics.TcpActiveOpens++;
4719 tcp_statistics.TcpOutSegs++;
4720
4721 release_sock(sk);
4722 return(0);
4723 }4724
4725
4726 /*4727 * This functions checks to see if the tcp header is actually acceptable. 4728 */4729
4730 extern__inline__inttcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4731 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4732 {4733 u32next_seq;
4734
4735 next_seq = len - 4*th->doff;
4736 if (th->fin)
4737 next_seq++;
4738 /* if we have a zero window, we can't have any data in the packet.. */4739 if (next_seq && !sk->window)
4740 gotoignore_it;
4741 next_seq += ntohl(th->seq);
4742
4743 /*4744 * This isn't quite right. sk->acked_seq could be more recent4745 * than sk->window. This is however close enough. We will accept4746 * slightly more packets than we should, but it should not cause4747 * problems unless someone is trying to forge packets.4748 */4749
4750 /* have we already seen all of this packet? */4751 if (!after(next_seq+1, sk->acked_seq))
4752 gotoignore_it;
4753 /* or does it start beyond the window? */4754 if (!before(ntohl(th->seq), sk->acked_seq + sk->window + 1))
4755 gotoignore_it;
4756
4757 /* ok, at least part of this packet would seem interesting.. */4758 return 1;
4759
4760 ignore_it:
4761 if (th->rst)
4762 return 0;
4763
4764 /*4765 * Send a reset if we get something not ours and we are4766 * unsynchronized. Note: We don't do anything to our end. We4767 * are just killing the bogus remote connection then we will4768 * connect again and it will work (with luck).4769 */4770
4771 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4772 {4773 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4774 return 1;
4775 }4776
4777 /* Try to resync things. */4778 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4779 return 0;
4780 }4781
/*
 *	When we get a reset we do this.
 *
 *	Marks the socket zapped, picks sk->err from the current state
 *	(ECONNREFUSED in SYN_SENT, EPIPE in CLOSE_WAIT, ECONNRESET
 *	otherwise), closes the socket, notifies the owner if there is
 *	one, then frees skb and releases the socket.
 *
 *	Always returns 0.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	switch (sk->state) {
	case TCP_SYN_SENT:
		sk->err = ECONNREFUSED;
		break;
	case TCP_CLOSE_WAIT:
		sk->err = EPIPE;
		break;
	default:
		sk->err = ECONNRESET;
		break;
	}

#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]:
	 *	a socket in TIME_WAIT is left in that state.
	 */
	if (sk->state != TCP_TIME_WAIT) {
		tcp_set_state(sk, TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk, TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif

	if (!sk->dead)
		sk->state_change(sk);

	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return 0;
}
4814 /*4815 * A TCP packet has arrived.4816 * skb->h.raw is the TCP header.4817 */4818
4819 inttcp_rcv(structsk_buff *skb, structdevice *dev, structoptions *opt,
/* */4820 __u32daddr, unsignedshortlen,
4821 __u32saddr, intredo, structinet_protocol * protocol)
4822 {4823 structtcphdr *th;
4824 structsock *sk;
4825 intsyn_ok=0;
4826
4827 tcp_statistics.TcpInSegs++;
4828 if(skb->pkt_type!=PACKET_HOST)
4829 {4830 kfree_skb(skb,FREE_READ);
4831 return(0);
4832 }4833
4834 th = skb->h.th;
4835
4836 /*4837 * Find the socket, using the last hit cache if applicable.4838 */4839
4840 if(!redo && saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
4841 {4842 sk=(structsock *)th_cache_sk;
4843 /*4844 * We think this is causing the bug so4845 */4846 if(sk!=get_sock(&tcp_prot,th->dest, saddr, th->source, daddr))
4847 printk("Cache mismatch on TCP.\n");
4848 }4849 else4850 {4851 sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4852 th_cache_saddr=saddr;
4853 th_cache_daddr=daddr;
4854 th_cache_dport=th->dest;
4855 th_cache_sport=th->source;
4856 th_cache_sk=sk;
4857 }4858
4859 /*4860 * If this socket has got a reset it's to all intents and purposes 4861 * really dead. Count closed sockets as dead.4862 *4863 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD4864 * simply drops data. This seems incorrect as a 'closed' TCP doesn't4865 * exist so should cause resets as if the port was unreachable.4866 */4867
4868 if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4869 sk=NULL;
4870
4871 if (!redo)
4872 {4873 /*4874 * Pull up the IP header.4875 */4876 skb_pull(skb, skb->h.raw-skb->data);
4877 /*4878 * Try to use the device checksum if provided.4879 */4880 if (
4881 (skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
4882 (!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
4883 )
4884 {4885 skb->sk = NULL;
4886 kfree_skb(skb,FREE_READ);
4887 /*4888 * We don't release the socket because it was4889 * never marked in use.4890 */4891 return(0);
4892 }4893
4894 skb->seq = ntohl(th->seq);
4895 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
4896 skb->ack_seq = ntohl(th->ack_seq);
4897
4898 /* See if we know about the socket. */4899 if (sk == NULL)
4900 {4901 /*4902 * No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)4903 */4904 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4905 skb->sk = NULL;
4906 /*4907 * Discard frame4908 */4909 kfree_skb(skb, FREE_READ);
4910 return(0);
4911 }4912
4913 skb->acked = 0;
4914 skb->used = 0;
4915 skb->free = 0;
4916 skb->saddr = daddr;
4917 skb->daddr = saddr;
4918
4919 /* We may need to add it to the backlog here. */4920 cli();
4921 if (sk->inuse)
4922 {4923 skb_queue_tail(&sk->back_log, skb);
4924 sti();
4925 return(0);
4926 }4927 sk->inuse = 1;
4928 sti();
4929 }4930 else4931 {4932 if (sk==NULL)
4933 {4934 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4935 skb->sk = NULL;
4936 kfree_skb(skb, FREE_READ);
4937 return(0);
4938 }4939 }4940
4941
4942 if (!sk->prot)
4943 {4944 printk("IMPOSSIBLE 3\n");
4945 return(0);
4946 }4947
4948
4949 /*4950 * Charge the memory to the socket. 4951 */4952
4953 skb->sk=sk;
4954 sk->rmem_alloc += skb->truesize;
4955
4956 /*4957 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We4958 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug4959 * compatibility. We also set up variables more thoroughly [Karn notes in the4960 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].4961 */4962
4963 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */4964 {4965
4966 /*4967 * Now deal with unusual cases.4968 */4969
4970 if(sk->state==TCP_LISTEN)
4971 {4972 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */4973 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4974
4975 /*4976 * We don't care for RST, and non SYN are absorbed (old segments)4977 * Broadcast/multicast SYN isn't allowed. Note - bug if you change the4978 * netmask on a running connection it can go broadcast. Even Sun's have4979 * this problem so I'm ignoring it 4980 */4981
4982 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4983 {4984 kfree_skb(skb, FREE_READ);
4985 release_sock(sk);
4986 return 0;
4987 }4988
4989 /* 4990 * Guess we need to make a new socket up 4991 */4992
4993 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4994
4995 /*4996 * Now we have several options: In theory there is nothing else4997 * in the frame. KA9Q has an option to send data with the syn,4998 * BSD accepts data with the syn up to the [to be] advertised window4999 * and Solaris 2.1 gives you a protocol error. For now we just ignore5000 * it, that fits the spec precisely and avoids incompatibilities. It5001 * would be nice in future to drop through and process the data.5002 */5003
5004 release_sock(sk);
5005 return 0;
5006 }5007
5008 /* retransmitted SYN? */5009 if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
5010 {5011 kfree_skb(skb, FREE_READ);
5012 release_sock(sk);
5013 return 0;
5014 }5015
5016 /*5017 * SYN sent means we have to look for a suitable ack and either reset5018 * for bad matches or go to connected 5019 */5020
5021 if(sk->state==TCP_SYN_SENT)
5022 {5023 /* Crossed SYN or previous junk segment */5024 if(th->ack)
5025 {5026 /* We got an ack, but it's not a good ack */5027 if(!tcp_ack(sk,th,saddr,len))
5028 {5029 /* Reset the ack - its an ack from a 5030 different connection [ th->rst is checked in tcp_reset()] */5031 tcp_statistics.TcpAttemptFails++;
5032 tcp_reset(daddr, saddr, th,
5033 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
5034 kfree_skb(skb, FREE_READ);
5035 release_sock(sk);
5036 return(0);
5037 }5038 if(th->rst)
5039 returntcp_std_reset(sk,skb);
5040 if(!th->syn)
5041 {5042 /* A valid ack from a different connection5043 start. Shouldn't happen but cover it */5044 kfree_skb(skb, FREE_READ);
5045 release_sock(sk);
5046 return 0;
5047 }5048 /*5049 * Ok.. it's good. Set up sequence numbers and5050 * move to established.5051 */5052 syn_ok=1; /* Don't reset this connection for the syn */5053 sk->acked_seq = skb->seq+1;
5054 sk->fin_seq = skb->seq;
5055 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
5056 tcp_set_state(sk, TCP_ESTABLISHED);
5057 tcp_options(sk,th);
5058 sk->dummy_th.dest=th->source;
5059 sk->copied_seq = sk->acked_seq;
5060 if(!sk->dead)
5061 {5062 sk->state_change(sk);
5063 sock_wake_async(sk->socket, 0);
5064 }5065 if(sk->max_window==0)
5066 {5067 sk->max_window = 32;
5068 sk->mss = min(sk->max_window, sk->mtu);
5069 }5070 }5071 else5072 {5073 /* See if SYN's cross. Drop if boring */5074 if(th->syn && !th->rst)
5075 {5076 /* Crossed SYN's are fine - but talking to5077 yourself is right out... */5078 if(sk->saddr==saddr && sk->daddr==daddr &&
5079 sk->dummy_th.source==th->source &&
5080 sk->dummy_th.dest==th->dest)
5081 {5082 tcp_statistics.TcpAttemptFails++;
5083 returntcp_std_reset(sk,skb);
5084 }5085 tcp_set_state(sk,TCP_SYN_RECV);
5086
5087 /*5088 * FIXME:5089 * Must send SYN|ACK here5090 */5091 }5092 /* Discard junk segment */5093 kfree_skb(skb, FREE_READ);
5094 release_sock(sk);
5095 return 0;
5096 }5097 /*5098 * SYN_RECV with data maybe.. drop through5099 */5100 gotorfc_step6;
5101 }5102
5103 /*5104 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is5105 * a more complex suggestion for fixing these reuse issues in RFC16445106 * but not yet ready for general use. Also see RFC1379.5107 */5108
5109 #defineBSD_TIME_WAIT5110 #ifdefBSD_TIME_WAIT5111 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
5112 after(skb->seq, sk->acked_seq) && !th->rst)
5113 {5114 u32seq = sk->write_seq;
5115 if(sk->debug)
5116 printk("Doing a BSD time wait\n");
5117 tcp_statistics.TcpEstabResets++;
5118 sk->rmem_alloc -= skb->truesize;
5119 skb->sk = NULL;
5120 sk->err=ECONNRESET;
5121 tcp_set_state(sk, TCP_CLOSE);
5122 sk->shutdown = SHUTDOWN_MASK;
5123 release_sock(sk);
5124 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
5125 if (sk && sk->state==TCP_LISTEN)
5126 {5127 sk->inuse=1;
5128 skb->sk = sk;
5129 sk->rmem_alloc += skb->truesize;
5130 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
5131 release_sock(sk);
5132 return 0;
5133 }5134 kfree_skb(skb, FREE_READ);
5135 return 0;
5136 }5137 #endif5138 }5139
5140 /*5141 * We are now in normal data flow (see the step list in the RFC)5142 * Note most of these are inline now. I'll inline the lot when5143 * I have time to test it hard and look at what gcc outputs 5144 */5145
5146 if(!tcp_sequence(sk,th,len,opt,saddr,dev))
5147 {5148 kfree_skb(skb, FREE_READ);
5149 release_sock(sk);
5150 return 0;
5151 }5152
5153 if(th->rst)
5154 returntcp_std_reset(sk,skb);
5155
5156 /*5157 * !syn_ok is effectively the state test in RFC793.5158 */5159
5160 if(th->syn && !syn_ok)
5161 {5162 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
5163 returntcp_std_reset(sk,skb);
5164 }5165
5166 /*5167 * Process the ACK5168 */5169
5170
5171 if(th->ack && !tcp_ack(sk,th,saddr,len))
5172 {5173 /*5174 * Our three way handshake failed.5175 */5176
5177 if(sk->state==TCP_SYN_RECV)
5178 {5179 tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
5180 }5181 kfree_skb(skb, FREE_READ);
5182 release_sock(sk);
5183 return 0;
5184 }5185
5186 rfc_step6: /* I'll clean this up later */5187
5188 /*5189 * If the accepted buffer put us over our queue size we5190 * now drop it (we must process the ack first to avoid5191 * deadlock cases).5192 */5193
5194 if (sk->rmem_alloc >= sk->rcvbuf)
5195 {5196 kfree_skb(skb, FREE_READ);
5197 release_sock(sk);
5198 return(0);
5199 }5200
5201
5202 /*5203 * Process urgent data5204 */5205
5206 if(tcp_urg(sk, th, saddr, len))
5207 {5208 kfree_skb(skb, FREE_READ);
5209 release_sock(sk);
5210 return 0;
5211 }5212
5213 /*5214 * Process the encapsulated data5215 */5216
5217 if(tcp_data(skb,sk, saddr, len))
5218 {5219 kfree_skb(skb, FREE_READ);
5220 release_sock(sk);
5221 return 0;
5222 }5223
5224 /*5225 * And done5226 */5227
5228 release_sock(sk);
5229 return 0;
5230 }5231
5232 /*5233 * This routine sends a packet with an out of date sequence5234 * number. It assumes the other end will try to ack it.5235 */5236
5237 staticvoidtcp_write_wakeup(structsock *sk)
/* */5238 {5239 structsk_buff *buff,*skb;
5240 structtcphdr *t1;
5241 structdevice *dev=NULL;
5242 inttmp;
5243
5244 if (sk->zapped)
5245 return; /* After a valid reset we can send no more */5246
5247 /*5248 * Write data can still be transmitted/retransmitted in the5249 * following states. If any other state is encountered, return.5250 * [listen/close will never occur here anyway]5251 */5252
5253 if (sk->state != TCP_ESTABLISHED &&
5254 sk->state != TCP_CLOSE_WAIT &&
5255 sk->state != TCP_FIN_WAIT1 &&
5256 sk->state != TCP_LAST_ACK &&
5257 sk->state != TCP_CLOSING5258 )
5259 {5260 return;
5261 }5262 if ( before(sk->sent_seq, sk->window_seq) &&
5263 (skb=skb_peek(&sk->write_queue)))
5264 {5265 /*5266 * We are probing the opening of a window5267 * but the window size is != 05268 * must have been a result SWS advoidance ( sender )5269 */5270
5271 structiphdr *iph;
5272 structtcphdr *th;
5273 structtcphdr *nth;
5274 unsignedlongwin_size;
5275 #if 0
5276 unsignedlong ow_size;
5277 #endif5278 void * tcp_data_start;
5279
5280 /*5281 * How many bytes can we send ?5282 */5283
5284 win_size = sk->window_seq - sk->sent_seq;
5285
5286 /*5287 * Recover the buffer pointers5288 */5289
5290 iph = (structiphdr *)skb->ip_hdr;
5291 th = (structtcphdr *)(((char *)iph) +(iph->ihl << 2));
5292
5293 /*5294 * Grab the data for a temporary frame5295 */5296
5297 buff = sock_wmalloc(sk, win_size + th->doff * 4 +
5298 (iph->ihl << 2) +
5299 sk->prot->max_header + 15,
5300 1, GFP_ATOMIC);
5301 if ( buff == NULL )
5302 return;
5303
5304 /* 5305 * If we strip the packet on the write queue we must5306 * be ready to retransmit this one 5307 */5308
5309 buff->free = /*0*/1;
5310
5311 buff->sk = sk;
5312 buff->localroute = sk->localroute;
5313
5314 /*5315 * Put headers on the new packet5316 */5317
5318 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5319 IPPROTO_TCP, sk->opt, buff->truesize,
5320 sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
5321 if (tmp < 0)
5322 {5323 sock_wfree(sk, buff);
5324 return;
5325 }5326
5327 /*5328 * Move the TCP header over5329 */5330
5331 buff->dev = dev;
5332
5333 nth = (structtcphdr *) skb_put(buff,th->doff*4);
5334
5335 memcpy(nth, th, th->doff * 4);
5336
5337 /*5338 * Correct the new header5339 */5340
5341 nth->ack = 1;
5342 nth->ack_seq = htonl(sk->acked_seq);
5343 nth->window = htons(tcp_select_window(sk));
5344 nth->check = 0;
5345
5346 /*5347 * Find the first data byte.5348 */5349
5350 tcp_data_start = (char *) th + (th->doff << 2);
5351
5352 /*5353 * Add it to our new buffer5354 */5355
5356 memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
5357
5358 /*5359 * Remember our right edge sequence number.5360 */5361
5362 buff->end_seq = sk->sent_seq + win_size;
5363 sk->sent_seq = buff->end_seq; /* Hack */5364 if(th->urg && ntohs(th->urg_ptr) < win_size)
5365 nth->urg = 0;
5366
5367 /*5368 * Checksum the split buffer5369 */5370
5371 tcp_send_check(nth, sk->saddr, sk->daddr,
5372 nth->doff * 4 + win_size , sk);
5373 }5374 else5375 {5376 buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
5377 if (buff == NULL)
5378 return;
5379
5380 buff->free = 1;
5381 buff->sk = sk;
5382 buff->localroute = sk->localroute;
5383
5384 /*5385 * Put in the IP header and routing stuff. 5386 */5387
5388 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5389 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
5390 if (tmp < 0)
5391 {5392 sock_wfree(sk, buff);
5393 return;
5394 }5395
5396 t1 = (structtcphdr *)skb_put(buff,sizeof(structtcphdr));
5397 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
5398
5399 /*5400 * Use a previous sequence.5401 * This should cause the other end to send an ack.5402 */5403
5404 t1->seq = htonl(sk->sent_seq-1);
5405 t1->ack = 1;
5406 t1->res1= 0;
5407 t1->res2= 0;
5408 t1->rst = 0;
5409 t1->urg = 0;
5410 t1->psh = 0;
5411 t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */5412 t1->syn = 0;
5413 t1->ack_seq = htonl(sk->acked_seq);
5414 t1->window = htons(tcp_select_window(sk));
5415 t1->doff = sizeof(*t1)/4;
5416 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
5417
5418 }5419
5420 /*5421 * Send it.5422 */5423
5424 sk->prot->queue_xmit(sk, dev, buff, 1);
5425 tcp_statistics.TcpOutSegs++;
5426 }5427
5428 /*5429 * A window probe timeout has occurred.5430 */5431
5432 voidtcp_send_probe0(structsock *sk)
/* */5433 {5434 if (sk->zapped)
5435 return; /* After a valid reset we can send no more */5436
5437 tcp_write_wakeup(sk);
5438
5439 sk->backoff++;
5440 sk->rto = min(sk->rto << 1, 120*HZ);
5441 sk->retransmits++;
5442 sk->prot->retransmits ++;
5443 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5444 }5445
5446 /*5447 * Socket option code for TCP. 5448 */5449
5450 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5451 {5452 intval,err;
5453
5454 if(level!=SOL_TCP)
5455 returnip_setsockopt(sk,level,optname,optval,optlen);
5456
5457 if (optval == NULL)
5458 return(-EINVAL);
5459
5460 err=verify_area(VERIFY_READ, optval, sizeof(int));
5461 if(err)
5462 returnerr;
5463
5464 val = get_user((int *)optval);
5465
5466 switch(optname)
5467 {5468 caseTCP_MAXSEG:
5469 /*5470 * values greater than interface MTU won't take effect. however at5471 * the point when this call is done we typically don't yet know5472 * which interface is going to be used5473 */5474 if(val<1||val>MAX_WINDOW)
5475 return -EINVAL;
5476 sk->user_mss=val;
5477 return 0;
5478 caseTCP_NODELAY:
5479 sk->nonagle=(val==0)?0:1;
5480 return 0;
5481 default:
5482 return(-ENOPROTOOPT);
5483 }5484 }5485
5486 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5487 {5488 intval,err;
5489
5490 if(level!=SOL_TCP)
5491 returnip_getsockopt(sk,level,optname,optval,optlen);
5492
5493 switch(optname)
5494 {5495 caseTCP_MAXSEG:
5496 val=sk->user_mss;
5497 break;
5498 caseTCP_NODELAY:
5499 val=sk->nonagle;
5500 break;
5501 default:
5502 return(-ENOPROTOOPT);
5503 }5504 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5505 if(err)
5506 returnerr;
5507 put_user(sizeof(int),(int *) optlen);
5508
5509 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5510 if(err)
5511 returnerr;
5512 put_user(val,(int *)optval);
5513
5514 return(0);
5515 }5516
5517
5518 structprototcp_prot = {5519 tcp_close,
5520 ip_build_header,
5521 tcp_connect,
5522 tcp_accept,
5523 ip_queue_xmit,
5524 tcp_retransmit,
5525 tcp_write_wakeup,
5526 tcp_read_wakeup,
5527 tcp_rcv,
5528 tcp_select,
5529 tcp_ioctl,
5530 NULL,
5531 tcp_shutdown,
5532 tcp_setsockopt,
5533 tcp_getsockopt,
5534 tcp_sendmsg,
5535 tcp_recvmsg,
5536 NULL, /* No special bind() */5537 128,
5538 0,
5539 "TCP",
5540 0, 0,
5541 {NULL,}5542 };