1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while 26 * sk->inuse=1 and was trying to connect 27 * (tcp_err()). 28 * Alan Cox : All icmp error handling was broken 29 * pointers passed where wrong and the 30 * socket was looked up backwards. Nobody 31 * tested any icmp error code obviously. 32 * Alan Cox : tcp_err() now handled properly. It 33 * wakes people on errors. select 34 * behaves and the icmp error race 35 * has gone by moving it into sock.c 36 * Alan Cox : tcp_reset() fixed to work for 37 * everything not just packets for 38 * unknown sockets. 39 * Alan Cox : tcp option processing. 40 * Alan Cox : Reset tweaked (still not 100%) [Had 41 * syn rule wrong] 42 * Herp Rosmanith : More reset fixes 43 * Alan Cox : No longer acks invalid rst frames. 44 * Acking any kind of RST is right out. 45 * Alan Cox : Sets an ignore me flag on an rst 46 * receive otherwise odd bits of prattle 47 * escape still 48 * Alan Cox : Fixed another acking RST frame bug. 49 * Should stop LAN workplace lockups. 
50 * Alan Cox : Some tidyups using the new skb list 51 * facilities 52 * Alan Cox : sk->keepopen now seems to work 53 * Alan Cox : Pulls options out correctly on accepts 54 * Alan Cox : Fixed assorted sk->rqueue->next errors 55 * Alan Cox : PSH doesn't end a TCP read. Switched a 56 * bit to skb ops. 57 * Alan Cox : Tidied tcp_data to avoid a potential 58 * nasty. 59 * Alan Cox : Added some better commenting, as the 60 * tcp is hard to follow 61 * Alan Cox : Removed incorrect check for 20 * psh 62 * Michael O'Reilly : ack < copied bug fix. 63 * Johannes Stille : Misc tcp fixes (not all in yet). 64 * Alan Cox : FIN with no memory -> CRASH 65 * Alan Cox : Added socket option proto entries. 66 * Also added awareness of them to accept. 67 * Alan Cox : Added TCP options (SOL_TCP) 68 * Alan Cox : Switched wakeup calls to callbacks, 69 * so the kernel can layer network 70 * sockets. 71 * Alan Cox : Use ip_tos/ip_ttl settings. 72 * Alan Cox : Handle FIN (more) properly (we hope). 73 * Alan Cox : RST frames sent on unsynchronised 74 * state ack error. 75 * Alan Cox : Put in missing check for SYN bit. 76 * Alan Cox : Added tcp_select_window() aka NET2E 77 * window non shrink trick. 78 * Alan Cox : Added a couple of small NET2E timer 79 * fixes 80 * Charles Hedrick : TCP fixes 81 * Toomas Tamm : TCP window fixes 82 * Alan Cox : Small URG fix to rlogin ^C ack fight 83 * Charles Hedrick : Rewrote most of it to actually work 84 * Linus : Rewrote tcp_read() and URG handling 85 * completely 86 * Gerhard Koerting: Fixed some missing timer handling 87 * Matthew Dillon : Reworked TCP machine states as per RFC 88 * Gerhard Koerting: PC/TCP workarounds 89 * Adam Caldwell : Assorted timer/timing errors 90 * Matthew Dillon : Fixed another RST bug 91 * Alan Cox : Move to kernel side addressing changes. 92 * Alan Cox : Beginning work on TCP fastpathing 93 * (not yet usable) 94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
95 * Alan Cox : TCP fast path debugging 96 * Alan Cox : Window clamping 97 * Michael Riepe : Bug in tcp_check() 98 * Matt Dillon : More TCP improvements and RST bug fixes 99 * Matt Dillon : Yet more small nasties remove from the 100 * TCP code (Be very nice to this man if 101 * tcp finally works 100%) 8) 102 * Alan Cox : BSD accept semantics. 103 * Alan Cox : Reset on closedown bug. 104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 105 * Michael Pall : Handle select() after URG properly in 106 * all cases. 107 * Michael Pall : Undo the last fix in tcp_read_urg() 108 * (multi URG PUSH broke rlogin). 109 * Michael Pall : Fix the multi URG PUSH problem in 110 * tcp_readable(), select() after URG 111 * works now. 112 * Michael Pall : recv(...,MSG_OOB) never blocks in the 113 * BSD api. 114 * Alan Cox : Changed the semantics of sk->socket to 115 * fix a race and a signal problem with 116 * accept() and async I/O. 117 * Alan Cox : Relaxed the rules on tcp_sendto(). 118 * Yury Shevchuk : Really fixed accept() blocking problem. 119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 120 * clients/servers which listen in on 121 * fixed ports. 122 * Alan Cox : Cleaned the above up and shrank it to 123 * a sensible code size. 124 * Alan Cox : Self connect lockup fix. 125 * Alan Cox : No connect to multicast. 126 * Ross Biro : Close unaccepted children on master 127 * socket close. 128 * Alan Cox : Reset tracing code. 129 * Alan Cox : Spurious resets on shutdown. 130 * Alan Cox : Giant 15 minute/60 second timer error 131 * Alan Cox : Small whoops in selecting before an 132 * accept. 133 * Alan Cox : Kept the state trace facility since 134 * it's handy for debugging. 135 * Alan Cox : More reset handler fixes. 
136 * Alan Cox : Started rewriting the code based on 137 * the RFC's for other useful protocol 138 * references see: Comer, KA9Q NOS, and 139 * for a reference on the difference 140 * between specifications and how BSD 141 * works see the 4.4lite source. 142 * A.N.Kuznetsov : Don't time wait on completion of tidy 143 * close. 144 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 145 * Linus Torvalds : Fixed BSD port reuse to work first syn 146 * Alan Cox : Reimplemented timers as per the RFC 147 * and using multiple timers for sanity. 148 * Alan Cox : Small bug fixes, and a lot of new 149 * comments. 150 * Alan Cox : Fixed dual reader crash by locking 151 * the buffers (much like datagram.c) 152 * Alan Cox : Fixed stuck sockets in probe. A probe 153 * now gets fed up of retrying without 154 * (even a no space) answer. 155 * Alan Cox : Extracted closing code better 156 * Alan Cox : Fixed the closing state machine to 157 * resemble the RFC. 158 * Alan Cox : More 'per spec' fixes. 159 * Jorge Cwik : Even faster checksumming. 160 * Alan Cox : tcp_data() doesn't ack illegal PSH 161 * only frames. At least one pc tcp stack 162 * generates them. 163 * Alan Cox : Cache last socket. 164 * Alan Cox : Per route irtt. 165 * Matt Day : Select() match BSD precisely on error 166 * Alan Cox : New buffers 167 * Marc Tamsky : Various sk->prot->retransmits and 168 * sk->retransmits misupdating fixed. 169 * Fixed tcp_write_timeout: stuck close, 170 * and TCP syn retries gets used now. 171 * Mark Yarvis : In tcp_read_wakeup(), don't send an 172 * ack if stat is TCP_CLOSED. 173 * Alan Cox : Look up device on a retransmit - routes may 174 * change. Doesn't yet cope with MSS shrink right 175 * but its a start! 176 * Marc Tamsky : Closing in closing fixes. 177 * Mike Shaver : RFC1122 verifications. 178 * Alan Cox : rcv_saddr errors. 179 * Alan Cox : Block double connect(). 180 * Alan Cox : Small hooks for enSKIP. 181 * Alexey Kuznetsov: Path MTU discovery. 
182 * 183 * 184 * To Fix: 185 * Fast path the code. Two things here - fix the window calculation 186 * so it doesn't iterate over the queue, also spot packets with no funny 187 * options arriving in order and process directly. 188 * 189 * Rewrite output state machine to use a single queue and do low window 190 * situations as per the spec (RFC 1122) 191 * Speed up input assembly algorithm. 192 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 193 * could do with it working on IPv4 194 * User settable/learned rtt/max window/mtu 195 * Cope with MTU/device switches when retransmitting in tcp. 196 * Fix the window handling to use PR's new code. 197 * 198 * Change the fundamental structure to a single send queue maintained 199 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 200 * active routes too]). Cut the queue off in tcp_retransmit/ 201 * tcp_transmit. 202 * Change the receive queue to assemble as it goes. This lets us 203 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 204 * tcp_data/tcp_read as well as the window shrink crud. 205 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 206 * tcp_queue_skb seem obvious routines to extract. 207 * 208 * This program is free software; you can redistribute it and/or 209 * modify it under the terms of the GNU General Public License 210 * as published by the Free Software Foundation; either version 211 * 2 of the License, or(at your option) any later version. 212 * 213 * Description of States: 214 * 215 * TCP_SYN_SENT sent a connection request, waiting for ack 216 * 217 * TCP_SYN_RECV received a connection request, sent ack, 218 * waiting for final ack in three-way handshake. 
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *				to shutdown
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING.  Required because the other end
 *				may not have gotten our last ACK causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown.  There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */
248 /* 249 * RFC1122 status: 250 * NOTE: I'm not going to be doing comments in the code for this one except 251 * for violations and the like. tcp.c is just too big... If I say something 252 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out 253 * with Alan. -- MS 950903 254 * 255 * Use of PSH (4.2.2.2) 256 * MAY aggregate data sent without the PSH flag. (does) 257 * MAY queue data recieved without the PSH flag. (does) 258 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't) 259 * MAY implement PSH on send calls. (doesn't, thus:) 260 * MUST NOT buffer data indefinitely (doesn't [1 second]) 261 * MUST set PSH on last segment (does) 262 * MAY pass received PSH to application layer (doesn't) 263 * SHOULD send maximum-sized segment whenever possible. (almost always does) 264 * 265 * Window Size (4.2.2.3, 4.2.2.16) 266 * MUST treat window size as an unsigned number (does) 267 * SHOULD treat window size as a 32-bit number (does not) 268 * MUST NOT shrink window once it is offered (does not normally) 269 * 270 * Urgent Pointer (4.2.2.4) 271 * **MUST point urgent pointer to last byte of urgent data (not right 272 * after). (doesn't, to be like BSD) 273 * MUST inform application layer asynchronously of incoming urgent 274 * data. (does) 275 * MUST provide application with means of determining the amount of 276 * urgent data pending. (does) 277 * **MUST support urgent data sequence of arbitrary length. (doesn't, but 278 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) 279 * [Follows BSD 1 byte of urgent data] 280 * 281 * TCP Options (4.2.2.5) 282 * MUST be able to recieve TCP options in any segment. (does) 283 * MUST ignore unsupported options (does) 284 * 285 * Maximum Segment Size Option (4.2.2.6) 286 * MUST implement both sending and receiving MSS. (does) 287 * SHOULD send an MSS with every SYN where recieve MSS != 536 (MAY send 288 * it always). 
(does, even when MSS == 536, which is legal) 289 * MUST assume MSS == 536 if no MSS received at connection setup (does) 290 * MUST calculate "effective send MSS" correctly: 291 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) 292 * (does - but allows operator override) 293 * 294 * TCP Checksum (4.2.2.7) 295 * MUST generate and check TCP checksum. (does) 296 * 297 * Initial Sequence Number Selection (4.2.2.8) 298 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's 299 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is 300 * necessary for 10Mbps networks - and harder than BSD to spoof!) 301 * 302 * Simultaneous Open Attempts (4.2.2.10) 303 * MUST support simultaneous open attempts (does) 304 * 305 * Recovery from Old Duplicate SYN (4.2.2.11) 306 * MUST keep track of active vs. passive open (does) 307 * 308 * RST segment (4.2.2.12) 309 * SHOULD allow an RST segment to contain data (does, but doesn't do 310 * anything with it, which is standard) 311 * 312 * Closing a Connection (4.2.2.13) 313 * MUST inform application of whether connectin was closed by RST or 314 * normal close. (does) 315 * MAY allow "half-duplex" close (treat connection as closed for the 316 * local app, even before handshake is done). (does) 317 * MUST linger in TIME_WAIT for 2 * MSL (does) 318 * 319 * Retransmission Timeout (4.2.2.15) 320 * MUST implement Jacobson's slow start and congestion avoidance 321 * stuff. (does) 322 * 323 * Probing Zero Windows (4.2.2.17) 324 * MUST support probing of zero windows. (does) 325 * MAY keep offered window closed indefinitely. (does) 326 * MUST allow remote window to stay closed indefinitely. (does) 327 * 328 * Passive Open Calls (4.2.2.18) 329 * MUST NOT let new passive open affect other connections. (doesn't) 330 * MUST support passive opens (LISTENs) concurrently. (does) 331 * 332 * Time to Live (4.2.2.19) 333 * MUST make TCP TTL configurable. 
(does - IP_TTL option) 334 * 335 * Event Processing (4.2.2.20) 336 * SHOULD queue out-of-order segments. (does) 337 * MUST aggregate ACK segments whenever possible. (does but badly) 338 * 339 * Retransmission Timeout Calculation (4.2.3.1) 340 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO 341 * calculation. (does, or at least explains them in the comments 8*b) 342 * SHOULD initialize RTO to 0 and RTT to 3. (does) 343 * 344 * When to Send an ACK Segment (4.2.3.2) 345 * SHOULD implement delayed ACK. (does not) 346 * MUST keep ACK delay < 0.5 sec. (N/A) 347 * 348 * When to Send a Window Update (4.2.3.3) 349 * MUST implement receiver-side SWS. (does) 350 * 351 * When to Send Data (4.2.3.4) 352 * MUST implement sender-side SWS. (does - imperfectly) 353 * SHOULD implement Nagle algorithm. (does) 354 * 355 * TCP Connection Failures (4.2.3.5) 356 * MUST handle excessive retransmissions "properly" (see the RFC). (does) 357 * SHOULD inform application layer of soft errors. (doesn't) 358 * 359 * TCP Keep-Alives (4.2.3.6) 360 * MAY provide keep-alives. (does) 361 * MUST make keep-alives configurable on a per-connection basis. (does) 362 * MUST default to no keep-alives. (does) 363 * **MUST make keep-alive interval configurable. (doesn't) 364 * **MUST make default keep-alive interval > 2 hours. (doesn't) 365 * MUST NOT interpret failure to ACK keep-alive packet as dead 366 * connection. (doesn't) 367 * SHOULD send keep-alive with no data. (does) 368 * 369 * TCP Multihoming (4.2.3.7) 370 * MUST get source address from IP layer before sending first 371 * SYN. (does) 372 * MUST use same local address for all segments of a connection. (does) 373 * 374 * IP Options (4.2.3.8) 375 * (I don't think the IP layer sees the IP options, yet.) 376 * MUST ignore unsupported IP options. (does, I guess 8*b) 377 * MAY support Time Stamp and Record Route. (doesn't) 378 * **MUST allow application to specify a source route. (doesn't?) 
379 * **MUST allow receieved Source Route option to set route for all future 380 * segments on this connection. (doesn't, not that I think it's a 381 * huge problem) 382 * 383 * ICMP messages (4.2.3.9) 384 * MUST act on ICMP errors. (does) 385 * MUST slow transmission upon receipt of a Source Quench. (does) 386 * MUST NOT abort connection upon receipt of soft Destination 387 * Unreachables (0, 1, 5), Time Exceededs and Parameter 388 * Problems. (doesn't) 389 * SHOULD report soft Destination Unreachables etc. to the 390 * application. (doesn't) 391 * SHOULD abort connection upon receipt of hard Destination Unreachable 392 * messages (2, 3, 4). (does) 393 * 394 * Remote Address Validation (4.2.3.10) 395 * MUST reject as an error OPEN for invalid remote IP address. (does) 396 * MUST ignore SYN with invalid source address. (does) 397 * MUST silently discard incoming SYN for broadcast/multicast 398 * address. (does) 399 * 400 * Asynchronous Reports (4.2.4.1) 401 * **MUST provide mechanism for reporting soft errors to application 402 * layer. (doesn't) 403 * 404 * Type of Service (4.2.4.2) 405 * MUST allow application layer to set Type of Service. (does IP_TOS) 406 * 407 * (Whew. -- MS 950903) 408 **/ 409
410 #include <linux/types.h>
411 #include <linux/sched.h>
412 #include <linux/mm.h>
413 #include <linux/time.h>
414 #include <linux/string.h>
415 #include <linux/config.h>
416 #include <linux/socket.h>
417 #include <linux/sockios.h>
418 #include <linux/termios.h>
419 #include <linux/in.h>
420 #include <linux/fcntl.h>
421 #include <linux/inet.h>
422 #include <linux/netdevice.h>
423 #include <net/snmp.h>
424 #include <net/ip.h>
425 #include <net/protocol.h>
426 #include <net/icmp.h>
427 #include <net/tcp.h>
428 #include <net/arp.h>
429 #include <linux/skbuff.h>
430 #include <net/sock.h>
431 #include <net/route.h>
432 #include <linux/errno.h>
433 #include <linux/timer.h>
434 #include <asm/system.h>
435 #include <asm/segment.h>
436 #include <linux/mm.h>
437 #include <net/checksum.h>
438
439 /* 440 * The MSL timer is the 'normal' timer. 441 */ 442
443 #definereset_msl_timer(x,y,z) reset_timer(x,y,z)
444
445 #define SEQ_TICK 3
446 unsignedlongseq_offset;
447 structtcp_mibtcp_statistics;
448
449 /* 450 * Cached last hit socket 451 */ 452
453 volatileunsignedlongth_cache_saddr,th_cache_daddr;
454 volatileunsignedshortth_cache_dport, th_cache_sport;
455 volatilestructsock *th_cache_sk;
456
457 voidtcp_cache_zap(void)
/* */ 458 { 459 unsignedlongflags;
460 save_flags(flags);
461 cli();
462 th_cache_saddr=0;
463 th_cache_daddr=0;
464 th_cache_dport=0;
465 th_cache_sport=0;
466 th_cache_sk=NULL;
467 restore_flags(flags);
468 } 469
470 staticvoidtcp_close(structsock *sk, inttimeout);
471
472
473 /* 474 * The less said about this the better, but it works and will do for 1.2 475 */ 476
477 staticstructwait_queue *master_select_wakeup;
478
479 static__inline__intmin(unsignedinta, unsignedintb)
/* */ 480 { 481 if (a < b)
482 return(a);
483 return(b);
484 } 485
486 #undefSTATE_TRACE 487
488 #ifdefSTATE_TRACE 489 staticchar *statename[]={ 490 "Unused","Established","Syn Sent","Syn Recv",
491 "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
492 "Close Wait","Last ACK","Listen","Closing"
493 };
494 #endif 495
496 static__inline__voidtcp_set_state(structsock *sk, intstate)
/* */ 497 { 498 if(sk->state==TCP_ESTABLISHED)
499 tcp_statistics.TcpCurrEstab--;
500 #ifdefSTATE_TRACE 501 if(sk->debug)
502 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
503 #endif 504 /* This is a hack but it doesn't occur often and it's going to 505 be a real to fix nicely */ 506
507 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
508 { 509 wake_up_interruptible(&master_select_wakeup);
510 } 511 sk->state=state;
512 if(state==TCP_ESTABLISHED)
513 tcp_statistics.TcpCurrEstab++;
514 if(sk->state==TCP_CLOSE)
515 tcp_cache_zap();
516 } 517
518 /* 519 * This routine picks a TCP windows for a socket based on 520 * the following constraints 521 * 522 * 1. The window can never be shrunk once it is offered (RFC 793) 523 * 2. We limit memory per socket 524 * 525 * For now we use NET2E3's heuristic of offering half the memory 526 * we have handy. All is not as bad as this seems however because 527 * of two things. Firstly we will bin packets even within the window 528 * in order to get the data we are waiting for into the memory limit. 529 * Secondly we bin common duplicate forms at receive time 530 * Better heuristics welcome 531 */ 532
533 inttcp_select_window(structsock *sk)
/* */ 534 { 535 intnew_window = sock_rspace(sk);
536
537 if(sk->window_clamp)
538 new_window=min(sk->window_clamp,new_window);
539 /* 540 * Two things are going on here. First, we don't ever offer a 541 * window less than min(sk->mss, MAX_WINDOW/2). This is the 542 * receiver side of SWS as specified in RFC1122. 543 * Second, we always give them at least the window they 544 * had before, in order to avoid retracting window. This 545 * is technically allowed, but RFC1122 advises against it and 546 * in practice it causes trouble. 547 * 548 * Fixme: This doesn't correctly handle the case where 549 * new_window > sk->window but not by enough to allow for the 550 * shift in sequence space. 551 */ 552 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
553 return(sk->window);
554 return(new_window);
555 } 556
557 /* 558 * Find someone to 'accept'. Must be called with 559 * sk->inuse=1 or cli() 560 */ 561
562 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 563 { 564 structsk_buff *p=skb_peek(&s->receive_queue);
565 if(p==NULL)
566 returnNULL;
567 do 568 { 569 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
570 returnp;
571 p=p->next;
572 } 573 while(p!=(structsk_buff *)&s->receive_queue);
574 returnNULL;
575 } 576
577 /* 578 * Remove a completed connection and return it. This is used by 579 * tcp_accept() to get connections from the queue. 580 */ 581
582 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 583 { 584 structsk_buff *skb;
585 unsignedlongflags;
586 save_flags(flags);
587 cli();
588 skb=tcp_find_established(s);
589 if(skb!=NULL)
590 skb_unlink(skb); /* Take it off the queue */ 591 restore_flags(flags);
592 returnskb;
593 } 594
595 /* 596 * This routine closes sockets which have been at least partially 597 * opened, but not yet accepted. Currently it is only called by 598 * tcp_close, and timeout mirrors the value there. 599 */ 600
601 staticvoidtcp_close_pending (structsock *sk)
/* */ 602 { 603 structsk_buff *skb;
604
605 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
606 { 607 skb->sk->dead=1;
608 tcp_close(skb->sk, 0);
609 kfree_skb(skb, FREE_READ);
610 } 611 return;
612 } 613
614 /* 615 * Enter the time wait state. 616 */ 617
618 staticvoidtcp_time_wait(structsock *sk)
/* */ 619 { 620 tcp_set_state(sk,TCP_TIME_WAIT);
621 sk->shutdown = SHUTDOWN_MASK;
622 if (!sk->dead)
623 sk->state_change(sk);
624 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
625 } 626
627 /* 628 * A socket has timed out on its send queue and wants to do a 629 * little retransmitting. Currently this means TCP. 630 */ 631
632 voidtcp_do_retransmit(structsock *sk, intall)
/* */ 633 { 634 structsk_buff * skb;
635 structproto *prot;
636 structdevice *dev;
637 intct=0;
638 structrtable *rt;
639
640 prot = sk->prot;
641 skb = sk->send_head;
642
643 while (skb != NULL)
644 { 645 structtcphdr *th;
646 structiphdr *iph;
647 intsize;
648
649 dev = skb->dev;
650 IS_SKB(skb);
651 skb->when = jiffies;
652
653 /* 654 * Discard the surplus MAC header 655 */ 656
657 skb_pull(skb,((unsignedchar *)skb->ip_hdr)-skb->data);
658
659 /* 660 * In general it's OK just to use the old packet. However we 661 * need to use the current ack and window fields. Urg and 662 * urg_ptr could possibly stand to be updated as well, but we 663 * don't keep the necessary data. That shouldn't be a problem, 664 * if the other end is doing the right thing. Since we're 665 * changing the packet, we have to issue a new IP identifier. 666 */ 667
668 iph = (structiphdr *)skb->data;
669 th = (structtcphdr *)(((char *)iph) + (iph->ihl << 2));
670 size = ntohs(iph->tot_len) - (iph->ihl<<2);
671
672 /* 673 * Note: We ought to check for window limits here but 674 * currently this is done (less efficiently) elsewhere. 675 */ 676
677 /* 678 * Put a MAC header back on (may cause ARPing) 679 */ 680
681 { 682 /* ANK: UGLY, but the bug, that was here, should be fixed. 683 */ 684 structoptions * opt = (structoptions*)skb->proto_priv;
685 rt = ip_check_route(&sk->ip_route_cache, opt->srr?opt->faddr:iph->daddr, skb->localroute);
686 } 687
688 iph->id = htons(ip_id_count++);
689 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 690 if (rt && ntohs(iph->tot_len) > rt->rt_mtu)
691 iph->frag_off &= ~htons(IP_DF);
692 #endif 693 ip_send_check(iph);
694
695 if (rt==NULL) /* Deep poo */ 696 { 697 if(skb->sk)
698 { 699 skb->sk->err=ENETUNREACH;
700 skb->sk->error_report(skb->sk);
701 } 702 } 703 else 704 { 705 dev=rt->rt_dev;
706 skb->raddr=rt->rt_gateway;
707 skb->dev=dev;
708 skb->arp=1;
709 if (rt->rt_hh)
710 { 711 memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len);
712 if (!rt->rt_hh->hh_uptodate)
713 { 714 skb->arp = 0;
715 #ifRT_CACHE_DEBUG >= 2
716 printk("tcp_do_retransmit: hh miss %08x via %08x\n", iph->daddr, rt->rt_gateway);
717 #endif 718 } 719 } 720 elseif (dev->hard_header)
721 { 722 if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
723 skb->arp=0;
724 } 725
726 /* 727 * This is not the right way to handle this. We have to 728 * issue an up to date window and ack report with this 729 * retransmit to keep the odd buggy tcp that relies on 730 * the fact BSD does this happy. 731 * We don't however need to recalculate the entire 732 * checksum, so someone wanting a small problem to play 733 * with might like to implement RFC1141/RFC1624 and speed 734 * this up by avoiding a full checksum. 735 */ 736
737 th->ack_seq = ntohl(sk->acked_seq);
738 th->window = ntohs(tcp_select_window(sk));
739 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
740
741 /* 742 * If the interface is (still) up and running, kick it. 743 */ 744
745 if (dev->flags & IFF_UP)
746 { 747 /* 748 * If the packet is still being sent by the device/protocol 749 * below then don't retransmit. This is both needed, and good - 750 * especially with connected mode AX.25 where it stops resends 751 * occurring of an as yet unsent anyway frame! 752 * We still add up the counts as the round trip time wants 753 * adjusting. 754 */ 755 if (sk && !skb_device_locked(skb))
756 { 757 /* Remove it from any existing driver queue first! */ 758 skb_unlink(skb);
759 /* Now queue it */ 760 ip_statistics.IpOutRequests++;
761 dev_queue_xmit(skb, dev, sk->priority);
762 } 763 } 764 } 765
766 /* 767 * Count retransmissions 768 */ 769
770 ct++;
771 sk->prot->retransmits ++;
772 tcp_statistics.TcpRetransSegs++;
773
774
775 /* 776 * Only one retransmit requested. 777 */ 778
779 if (!all)
780 break;
781
782 /* 783 * This should cut it off before we send too many packets. 784 */ 785
786 if (ct >= sk->cong_window)
787 break;
788 skb = skb->link3;
789 } 790 } 791
792 /* 793 * Reset the retransmission timer 794 */ 795
796 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 797 { 798 del_timer(&sk->retransmit_timer);
799 sk->ip_xmit_timeout = why;
800 if((int)when < 0)
801 { 802 when=3;
803 printk("Error: Negative timer in xmit_timer\n");
804 } 805 sk->retransmit_timer.expires=jiffies+when;
806 add_timer(&sk->retransmit_timer);
807 } 808
809 /* 810 * This is the normal code called for timeouts. It does the retransmission 811 * and then does backoff. tcp_do_retransmit is separated out because 812 * tcp_ack needs to send stuff from the retransmit queue without 813 * initiating a backoff. 814 */ 815
816
817 voidtcp_retransmit_time(structsock *sk, intall)
/* */ 818 { 819 tcp_do_retransmit(sk, all);
820
821 /* 822 * Increase the timeout each time we retransmit. Note that 823 * we do not increase the rtt estimate. rto is initialized 824 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 825 * that doubling rto each time is the least we can get away with. 826 * In KA9Q, Karn uses this for the first few times, and then 827 * goes to quadratic. netBSD doubles, but only goes up to *64, 828 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 829 * defined in the protocol as the maximum possible RTT. I guess 830 * we'll have to use something other than TCP to talk to the 831 * University of Mars. 832 * 833 * PAWS allows us longer timeouts and large windows, so once 834 * implemented ftp to mars will work nicely. We will have to fix 835 * the 120 second clamps though! 836 */ 837
838 sk->retransmits++;
839 sk->prot->retransmits++;
840 sk->backoff++;
841 sk->rto = min(sk->rto << 1, 120*HZ);
842 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
843 } 844
845
846 /* 847 * A timer event has trigger a tcp retransmit timeout. The 848 * socket xmit queue is ready and set up to send. Because 849 * the ack receive code keeps the queue straight we do 850 * nothing clever here. 851 */ 852
853 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 854 { 855 if (all)
856 { 857 tcp_retransmit_time(sk, all);
858 return;
859 } 860
861 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 862 /* sk->ssthresh in theory can be zero. I guess that's OK */ 863 sk->cong_count = 0;
864
865 sk->cong_window = 1;
866
867 /* Do the actual retransmit. */ 868 tcp_retransmit_time(sk, all);
869 } 870
871 /* 872 * A write timeout has occurred. Process the after effects. 873 */ 874
875 staticinttcp_write_timeout(structsock *sk)
/* */ 876 { 877 /* 878 * Look for a 'soft' timeout. 879 */ 880 if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
881 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
882 { 883 /* 884 * Attempt to recover if arp has changed (unlikely!) or 885 * a route has shifted (not supported prior to 1.3). 886 */ 887 ip_rt_advice(&sk->ip_route_cache, 0);
888 } 889
890 /* 891 * Have we tried to SYN too many times (repent repent 8)) 892 */ 893
894 if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
895 { 896 sk->err=ETIMEDOUT;
897 sk->error_report(sk);
898 del_timer(&sk->retransmit_timer);
899 tcp_statistics.TcpAttemptFails++; /* Is this right ??? - FIXME - */ 900 tcp_set_state(sk,TCP_CLOSE);
901 /* Don't FIN, we got nothing back */ 902 release_sock(sk);
903 return 0;
904 } 905 /* 906 * Has it gone just too far ? 907 */ 908 if (sk->retransmits > TCP_RETR2)
909 { 910 sk->err = ETIMEDOUT;
911 sk->error_report(sk);
912 del_timer(&sk->retransmit_timer);
913 /* 914 * Time wait the socket 915 */ 916 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
917 { 918 tcp_set_state(sk,TCP_TIME_WAIT);
919 reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
920 } 921 else 922 { 923 /* 924 * Clean up time. 925 */ 926 tcp_set_state(sk, TCP_CLOSE);
927 release_sock(sk);
928 return 0;
929 } 930 } 931 return 1;
932 } 933
934 /* 935 * The TCP retransmit timer. This lacks a few small details. 936 * 937 * 1. An initial rtt timeout on the probe0 should cause what we can 938 * of the first write queue buffer to be split and sent. 939 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report 940 * ETIMEDOUT if we know an additional 'soft' error caused this. 941 * tcp_err should save a 'soft error' for us. 942 */ 943
/*
 * Timer callback; 'data' is the struct sock * cast to a long when the
 * timer was armed.  Dispatches on sk->ip_xmit_timeout to do zero-window
 * probing, retransmission or keepalive probing, and first flushes any
 * backlogged ack.
 */
static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;	/* reason the timer was armed */

	/*
	 * Only process if the socket is not in use; otherwise re-arm
	 * the timer and retry the whole job in one second.
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;	/* take the socket lock ourselves */
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing: the peer has advertised a zero window */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);	/* check soft/hard timeout limits */
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				/* Nothing unacked in flight; the ack above was all */
				restore_flags(flags);
			}
			else
			{
				/*
				 * Kicked by a delayed ack. Reset timer
				 * correctly now: the head segment's RTO has
				 * not actually expired yet.
				 */
				if (jiffies < skb->when + sk->rto)
				{
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 * Genuine retransmission timeout.
				 */
				sk->retransmits++;
				sk->prot->retransmits++;
				sk->prot->retransmit (sk, 0);
				tcp_write_timeout(sk);	/* may reset/close after too many tries */
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * This reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
1045 /*1046 * This routine is called by the ICMP module when it gets some1047 * sort of error condition. If err < 0 then the socket should1048 * be closed and the error returned to the user. If err > 01049 * it's just the icmp type << 8 | icmp code. After adjustment1050 * header points to the first 8 bytes of the tcp header. We need1051 * to find the appropriate port.1052 */1053
/*
 * ICMP error handler for TCP (calling convention described above).
 * 'header' points at the first 8 bytes of the offending TCP header,
 * so only the port fields of 'th' may be trusted.
 */
void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
	__u32 saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th = (struct tcphdr *)header;
	struct sock *sk;

	/*
	 * This one is _WRONG_. FIXME urgently.
	 * (Presumably because stepping back a fixed sizeof(struct iphdr)
	 * ignores IP options - unverified.)
	 */
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
	struct iphdr *iph=(struct iphdr *)(header-sizeof(struct iphdr));
#endif
	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)
		return;		/* no matching socket: nothing to report to */

	if (type == ICMP_SOURCE_QUENCH)
	{
		/*
		 * FIXME:
		 * For now we will just trigger a linear backoff.
		 * The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	if (type == ICMP_PARAMETERPROB)
	{
		sk->err=EPROTO;
		sk->error_report(sk);
		/* NOTE(review): no return here, so execution falls through to
		 * the generic code<13 handling below - confirm intended. */
	}

#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
	{
		struct rtable * rt;
		/*
		 * Ugly trick to pass MTU to protocol layer.
		 * Really we should add argument "info" to error handler.
		 */
		unsigned short new_mtu = ntohs(iph->id);

		/* Clamp both the cached route and the socket mss to the new MTU */
		if ((rt = sk->ip_route_cache) != NULL)
			if (rt->rt_mtu > new_mtu)
				rt->rt_mtu = new_mtu;

		if (sk->mtu > new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr))
			sk->mtu = new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);

		return;
	}
#endif

	/*
	 * If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 */

	if (code < 13 && (icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV))
	{
		sk->err = icmp_err_convert[code].errno;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
		{
			/* The connection attempt itself has failed */
			tcp_statistics.TcpAttemptFails++;
			tcp_set_state(sk,TCP_CLOSE);
			sk->error_report(sk);	/* Wake people up to see the error (see connect in sock.c) */
		}
	}
	return;
}
1129
1130 /*1131 * Walk down the receive queue counting readable data until we hit the end or we find a gap1132 * in the received data queue (ie a frame missing that needs sending to us). Not1133 * sorting using two queues as data arrives makes life so much harder.1134 */1135
/*
 * Count how many bytes are readable on sk's receive queue, walking
 * forward from copied_seq until a sequence hole or a PSH boundary.
 * Returns 0 for a NULL socket or an empty queue.
 */
static int tcp_readable(struct sock *sk)
{
	unsigned long counted;
	unsigned long amount;
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	save_flags(flags);
	cli();		/* keep the queue stable while we walk it */
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 * Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->h.th->seq))	/* Found a hole so stops here */
			break;
		sum = skb->len -(counted - skb->h.th->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;		/* SYN occupies a sequence number but no byte */
		if (sum > 0)
		{					/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;	/* don't count the SYN as readable data */
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop. This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue. Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read(). And the queue scan in tcp_read()
		 * was correct. Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
1207 /*1208 * LISTEN is a special case for select..1209 */1210 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */1211 {1212 if (sel_type == SEL_IN) {1213 intretval;
1214
1215 sk->inuse = 1;
1216 retval = (tcp_find_established(sk) != NULL);
1217 release_sock(sk);
1218 if (!retval)
1219 select_wait(&master_select_wakeup,wait);
1220 returnretval;
1221 }1222 return 0;
1223 }1224
1225
/*
 * Wait for a TCP event.
 *
 * Note that we don't need to set "sk->inuse", as the upper select layers
 * take care of normal races (between the test and the event) and we don't
 * go look at any of the socket buffers directly.
 *
 * Returns 1 when the requested condition already holds; otherwise
 * registers on sk->sleep and returns 0 (SEL_OUT after SEND_SHUTDOWN
 * returns 0 without waiting).
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type)
	{
		case SEL_IN:
			if (sk->err)
				return 1;	/* a pending error is "readable" */
			if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
				break;		/* not connected yet: wait */

			if (sk->shutdown & RCV_SHUTDOWN)
				return 1;	/* EOF is readable */

			if (sk->acked_seq == sk->copied_seq)
				break;		/* nothing new has been acked in */

			/*
			 * Readable unless the single unread byte is
			 * out-of-line urgent data.
			 */
			if (sk->urg_seq != sk->copied_seq ||
			    sk->acked_seq != sk->copied_seq+1 ||
			    sk->urginline || !sk->urg_data)
				return 1;
			break;

		case SEL_OUT:
			if (sk->err)
				return 1;
			if (sk->shutdown & SEND_SHUTDOWN)
				return 0;	/* further writes can never succeed */
			if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
				break;
			/*
			 * This is now right thanks to a small fix
			 * by Matt Dillon.
			 */

			/* Writable only if a full-sized frame would fit */
			if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
				break;
			return 1;

		case SEL_EX:
			if (sk->urg_data)
				return 1;	/* urgent data is the exceptional condition */
			break;
	}
	select_wait(sk->sleep, wait);
	return 0;
}
1282 inttcp_ioctl(structsock *sk, intcmd, unsignedlongarg)
/* */1283 {1284 interr;
1285 switch(cmd)
1286 {1287
1288 caseTIOCINQ:
1289 #ifdef FIXME /* FIXME: */1290 caseFIONREAD:
1291 #endif1292 {1293 unsignedlongamount;
1294
1295 if (sk->state == TCP_LISTEN)
1296 return(-EINVAL);
1297
1298 sk->inuse = 1;
1299 amount = tcp_readable(sk);
1300 release_sock(sk);
1301 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1302 if(err)
1303 returnerr;
1304 put_user(amount, (int *)arg);
1305 return(0);
1306 }1307 caseSIOCATMARK:
1308 {1309 intansw = sk->urg_data && sk->urg_seq == sk->copied_seq;
1310
1311 err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1312 if (err)
1313 returnerr;
1314 put_user(answ,(int *) arg);
1315 return(0);
1316 }1317 caseTIOCOUTQ:
1318 {1319 unsignedlongamount;
1320
1321 if (sk->state == TCP_LISTEN) return(-EINVAL);
1322 amount = sock_wspace(sk);
1323 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1324 if(err)
1325 returnerr;
1326 put_user(amount, (int *)arg);
1327 return(0);
1328 }1329 default:
1330 return(-EINVAL);
1331 }1332 }1333
1334
1335 /*1336 * This routine computes a TCP checksum. 1337 *1338 * Modified January 1995 from a go-faster DOS routine by1339 * Jorge Cwik <jorge@laser.satlink.net>1340 */1341
/*
 * Compute the TCP checksum: fold the pseudo-header (addresses, length,
 * protocol) into 'base', the partial sum already accumulated over the
 * TCP header and data (see tcp_send_check).  The 'th' argument is
 * unused here and kept only for the established signature.
 */
unsigned short tcp_check(struct tcphdr *th, int len,
	  unsigned long saddr, unsigned long daddr, unsigned long base)
{
	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
}
1348
1349
1350 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1351 unsignedlongdaddr, intlen, structsock *sk)
1352 {1353 th->check = 0;
1354 th->check = tcp_check(th, len, saddr, daddr,
1355 csum_partial((char *)th,len,0));
1356 return;
1357 }1358
1359 /*1360 * This is the main buffer sending routine. We queue the buffer1361 * having checked it is sane seeming.1362 */1363
/*
 * This is the main buffer sending routine. We queue the buffer
 * having checked it is sane seeming: the frame is either appended to
 * the write queue (window/Nagle/congestion limits) or transmitted
 * immediately.  Bogus frames are freed and dropped.
 */
static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 * length of packet (not counting length of pre-tcp headers)
	 */

	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 * Sanity check it..
	 */

	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 * If we have queued a header size packet.. (these crash a few
	 * tcp stacks if ack is not set)
	 */

	if (size == sizeof(struct tcphdr))
	{
		/* If it's got a syn or fin it's notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 * Actual processing.
	 */

	tcp_statistics.TcpOutSegs++;
	/* Right edge of the segment in sequence space (data length only) */
	skb->h.seq = ntohl(th->seq) + size - 4*th->doff;

	/*
	 * We must queue if
	 *
	 * a) The right edge of this frame exceeds the window
	 * b) We are retransmitting (Nagle's rule)
	 * c) We have too many packets 'in flight'
	 */

	if (after(skb->h.seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit. So
		 * we shouldn't need to set it at all. I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 * If we don't fit we have to start the zero window
		 * probes. This is broken - we really need to do a partial
		 * send _first_ (This is what causes the Cisco and PC/TCP
		 * grief).
		 */

		if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 * This is going straight out
		 */

		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 * This is mad. The tcp retransmit queue is put together
		 * by the ip layer. This causes half the problems with
		 * unroutable FIN's and other things.
		 */

		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 * Set for next retransmit based on expected ACK time.
		 * FIXME: We set this every time which means our
		 * retransmits are really about a window behind.
		 */

		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
1474 /*1475 * Locking problems lead us to a messy situation where we can have1476 * multiple partially complete buffers queued up. This is really bad1477 * as we don't want to be sending partial buffers. Fix this with1478 * a semaphore or similar to lock tcp_write per socket.1479 *1480 * These routines are pretty self descriptive.1481 */1482
1483 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1484 {1485 structsk_buff * skb;
1486 unsignedlongflags;
1487
1488 save_flags(flags);
1489 cli();
1490 skb = sk->partial;
1491 if (skb) {1492 sk->partial = NULL;
1493 del_timer(&sk->partial_timer);
1494 }1495 restore_flags(flags);
1496 returnskb;
1497 }1498
1499 /*1500 * Empty the partial queue1501 */1502
1503 staticvoidtcp_send_partial(structsock *sk)
/* */1504 {1505 structsk_buff *skb;
1506
1507 if (sk == NULL)
1508 return;
1509 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1510 tcp_send_skb(sk, skb);
1511 }1512
1513 /*1514 * Queue a partial frame1515 */1516
1517 voidtcp_enqueue_partial(structsk_buff * skb, structsock * sk)
/* */1518 {1519 structsk_buff * tmp;
1520 unsignedlongflags;
1521
1522 save_flags(flags);
1523 cli();
1524 tmp = sk->partial;
1525 if (tmp)
1526 del_timer(&sk->partial_timer);
1527 sk->partial = skb;
1528 init_timer(&sk->partial_timer);
1529 /*1530 * Wait up to 1 second for the buffer to fill.1531 */1532 sk->partial_timer.expires = jiffies+HZ;
1533 sk->partial_timer.function = (void (*)(unsignedlong)) tcp_send_partial;
1534 sk->partial_timer.data = (unsignedlong) sk;
1535 add_timer(&sk->partial_timer);
1536 restore_flags(flags);
1537 if (tmp)
1538 tcp_send_skb(sk, tmp);
1539 }1540
1541
1542 /*1543 * This routine sends an ack and also updates the window. 1544 */1545
/*
 * This routine sends an ack and also updates the window.
 *
 * 'sequence'/'ack' are the host-order sequence and ack numbers to
 * send; 'th' is the header of the segment being acknowledged (used as
 * a template for the reply).  If no memory is available the ack is
 * merely backlogged - ACKs are unreliable anyway.
 */
static void tcp_send_ack(u32 sequence, u32 ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;		/* We have been reset, we may not send again */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 * Force it to send an ack. We don't have to do this
		 * (ACK is unreliable) but it's much better use of
		 * bandwidth on slow links to send a spare ack than
		 * resend packets.
		 */

		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 * Assemble a suitable TCP frame
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 * Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		/* Header build failed: give the buffer back */
		buff->free = 1;
		sock_wfree(sk, buff);
		return;
	}
	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1, th, sizeof(*t1));	/* start from the peer's header as template */

	/*
	 * Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 * If we have nothing queued for transmit and the transmit timer
	 * is on we are just doing an ACK timeout and need to switch
	 * to a keepalive.
	 */

	if (ack == sk->acked_seq)
	{
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
			&& sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 * Fill in the packet and send it
	 */

	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}
1657
1658 /* 1659 * This routine builds a generic TCP header. 1660 */1661
1662 extern__inlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */1663 {1664
1665 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1666 th->seq = htonl(sk->write_seq);
1667 th->psh =(push == 0) ? 1 : 0;
1668 th->doff = sizeof(*th)/4;
1669 th->ack = 1;
1670 th->fin = 0;
1671 sk->ack_backlog = 0;
1672 sk->bytes_rcv = 0;
1673 sk->ack_timed = 0;
1674 th->ack_seq = htonl(sk->acked_seq);
1675 sk->window = tcp_select_window(sk);
1676 th->window = htons(sk->window);
1677
1678 return(sizeof(*th));
1679 }1680
1681 /*1682 * This routine copies from a user buffer into a socket,1683 * and starts the transmit system.1684 */1685
1686 staticinttcp_sendmsg(structsock *sk, structmsghdr *msg,
/* */1687 intlen, intnonblock, intflags)
1688 {1689 intcopied = 0;
1690 intcopy;
1691 inttmp;
1692 intseglen;
1693 intiovct=0;
1694 structsk_buff *skb;
1695 structsk_buff *send_tmp;
1696 structproto *prot;
1697 structdevice *dev = NULL;
1698 unsignedchar *from;
1699
1700 /*1701 * Do sanity checking for sendmsg/sendto/send1702 */1703
1704 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1705 return -EINVAL;
1706 if (msg->msg_name)
1707 {1708 structsockaddr_in *addr=(structsockaddr_in *)msg->msg_name;
1709 if(sk->state == TCP_CLOSE)
1710 return -ENOTCONN;
1711 if (msg->msg_namelen < sizeof(*addr))
1712 return -EINVAL;
1713 if (addr->sin_family && addr->sin_family != AF_INET)
1714 return -EINVAL;
1715 if (addr->sin_port != sk->dummy_th.dest)
1716 return -EISCONN;
1717 if (addr->sin_addr.s_addr != sk->daddr)
1718 return -EISCONN;
1719 }1720
1721 /*1722 * Ok commence sending1723 */1724
1725 while(iovct<msg->msg_iovlen)
1726 {1727 seglen=msg->msg_iov[iovct].iov_len;
1728 from=msg->msg_iov[iovct++].iov_base;
1729 sk->inuse=1;
1730 prot = sk->prot;
1731 while(seglen > 0)
1732 {1733 if (sk->err)
1734 {/* Stop on an error */1735 release_sock(sk);
1736 if (copied)
1737 return(copied);
1738 returnsock_error(sk);
1739 }1740
1741 /*1742 * First thing we do is make sure that we are established. 1743 */1744
1745 if (sk->shutdown & SEND_SHUTDOWN)
1746 {1747 release_sock(sk);
1748 sk->err = EPIPE;
1749 if (copied)
1750 return(copied);
1751 sk->err = 0;
1752 return(-EPIPE);
1753 }1754
1755 /* 1756 * Wait for a connection to finish.1757 */1758
1759 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
1760 {1761 if (sk->err)
1762 {1763 release_sock(sk);
1764 if (copied)
1765 return(copied);
1766 returnsock_error(sk);
1767 }1768
1769 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
1770 {1771 release_sock(sk);
1772 if (copied)
1773 return(copied);
1774
1775 if (sk->err)
1776 returnsock_error(sk);
1777
1778 if (sk->keepopen)
1779 {1780 send_sig(SIGPIPE, current, 0);
1781 }1782 return(-EPIPE);
1783 }1784
1785 if (nonblock || copied)
1786 {1787 release_sock(sk);
1788 if (copied)
1789 return(copied);
1790 return(-EAGAIN);
1791 }1792
1793 release_sock(sk);
1794 cli();
1795
1796 if (sk->state != TCP_ESTABLISHED &&
1797 sk->state != TCP_CLOSE_WAIT && sk->err == 0)
1798 {1799 interruptible_sleep_on(sk->sleep);
1800 if (current->signal & ~current->blocked)
1801 {1802 sti();
1803 if (copied)
1804 return(copied);
1805 return(-ERESTARTSYS);
1806 }1807 }1808 sk->inuse = 1;
1809 sti();
1810 }1811
1812 /*1813 * The following code can result in copy <= if sk->mss is ever1814 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).1815 * sk->mtu is constant once SYN processing is finished. I.e. we1816 * had better not get here until we've seen his SYN and at least one1817 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)1818 * But ESTABLISHED should guarantee that. sk->max_window is by definition1819 * non-decreasing. Note that any ioctl to set user_mss must be done1820 * before the exchange of SYN's. If the initial ack from the other1821 * end has a window of 0, max_window and thus mss will both be 0.1822 */1823
1824 /* 1825 * Now we need to check if we have a half built packet. 1826 */1827 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1828 /*1829 * FIXME: I'm almost sure that this fragment is BUG,1830 * but it works... I do not know why 8) --ANK1831 *1832 * Really, we should rebuild all the queues...1833 * It's difficult. Temprorary hack is to send all1834 * queued segments with allowed fragmentation.1835 */1836 {1837 intnew_mss = min(sk->mtu, sk->max_window);
1838 if (new_mss < sk->mss)
1839 {1840 tcp_send_partial(sk);
1841 sk->mss = new_mss;
1842 }1843 }1844 #endif1845
1846 if ((skb = tcp_dequeue_partial(sk)) != NULL)
1847 {1848 inthdrlen;
1849
1850 /* IP header + TCP header */1851 hdrlen = ((unsignedlong)skb->h.th - (unsignedlong)skb->data)
1852 + sizeof(structtcphdr);
1853
1854 /* Add more stuff to the end of skb->len */1855 if (!(flags & MSG_OOB))
1856 {1857 copy = min(sk->mss - (skb->len - hdrlen), seglen);
1858 if (copy <= 0)
1859 {1860 printk("TCP: **bug**: \"copy\" <= 0\n");
1861 return -EFAULT;
1862 }1863 memcpy_fromfs(skb_put(skb,copy), from, copy);
1864 from += copy;
1865 copied += copy;
1866 len -= copy;
1867 sk->write_seq += copy;
1868 seglen -= copy;
1869 }1870 if ((skb->len - hdrlen) >= sk->mss ||
1871 (flags & MSG_OOB) || !sk->packets_out)
1872 tcp_send_skb(sk, skb);
1873 else1874 tcp_enqueue_partial(skb, sk);
1875 continue;
1876 }1877
1878 /*1879 * We also need to worry about the window.1880 * If window < 1/2 the maximum window we've seen from this1881 * host, don't use it. This is sender side1882 * silly window prevention, as specified in RFC1122.1883 * (Note that this is different than earlier versions of1884 * SWS prevention, e.g. RFC813.). What we actually do is 1885 * use the whole MSS. Since the results in the right1886 * edge of the packet being outside the window, it will1887 * be queued for later rather than sent.1888 */1889
1890 copy = sk->window_seq - sk->write_seq;
1891 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1892 copy = sk->mss;
1893 if (copy > seglen)
1894 copy = seglen;
1895
1896 /*1897 * We should really check the window here also. 1898 */1899
1900 send_tmp = NULL;
1901 if (copy < sk->mss && !(flags & MSG_OOB))
1902 {1903 /*1904 * We will release the socket in case we sleep here. 1905 */1906 release_sock(sk);
1907 /*1908 * NB: following must be mtu, because mss can be increased.1909 * mss is always <= mtu 1910 */1911 skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
1912 sk->inuse = 1;
1913 send_tmp = skb;
1914 }1915 else1916 {1917 /*1918 * We will release the socket in case we sleep here. 1919 */1920 release_sock(sk);
1921 skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
1922 sk->inuse = 1;
1923 }1924
1925 /*1926 * If we didn't get any memory, we need to sleep. 1927 */1928
1929 if (skb == NULL)
1930 {1931 sk->socket->flags |= SO_NOSPACE;
1932 if (nonblock)
1933 {1934 release_sock(sk);
1935 if (copied)
1936 return(copied);
1937 return(-EAGAIN);
1938 }1939
1940 /*1941 * FIXME: here is another race condition. 1942 */1943
1944 tmp = sk->wmem_alloc;
1945 release_sock(sk);
1946 cli();
1947 /*1948 * Again we will try to avoid it. 1949 */1950 if (tmp <= sk->wmem_alloc &&
1951 (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1952 && sk->err == 0)
1953 {1954 sk->socket->flags &= ~SO_NOSPACE;
1955 interruptible_sleep_on(sk->sleep);
1956 if (current->signal & ~current->blocked)
1957 {1958 sti();
1959 if (copied)
1960 return(copied);
1961 return(-ERESTARTSYS);
1962 }1963 }1964 sk->inuse = 1;
1965 sti();
1966 continue;
1967 }1968
1969 skb->sk = sk;
1970 skb->free = 0;
1971 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1972
1973 /*1974 * FIXME: we need to optimize this.1975 * Perhaps some hints here would be good.1976 */1977
1978 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1979 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1980 if (tmp < 0 )
1981 {1982 sock_wfree(sk, skb);
1983 release_sock(sk);
1984 if (copied)
1985 return(copied);
1986 return(tmp);
1987 }1988 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1989 skb->ip_hdr->frag_off |= htons(IP_DF);
1990 #endif1991 skb->dev = dev;
1992 skb->h.th =(structtcphdr *)skb_put(skb,sizeof(structtcphdr));
1993 tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
1994 if (tmp < 0)
1995 {1996 sock_wfree(sk, skb);
1997 release_sock(sk);
1998 if (copied)
1999 return(copied);
2000 return(tmp);
2001 }2002
2003 if (flags & MSG_OOB)
2004 {2005 skb->h.th->urg = 1;
2006 skb->h.th->urg_ptr = ntohs(copy);
2007 }2008
2009 memcpy_fromfs(skb_put(skb,copy), from, copy);
2010
2011 from += copy;
2012 copied += copy;
2013 len -= copy;
2014 seglen -= copy;
2015 skb->free = 0;
2016 sk->write_seq += copy;
2017
2018 if (send_tmp != NULL && sk->packets_out)
2019 {2020 tcp_enqueue_partial(send_tmp, sk);
2021 continue;
2022 }2023 tcp_send_skb(sk, skb);
2024 }2025 }2026 sk->err = 0;
2027
2028 /*2029 * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly2030 * interactive fast network servers. It's meant to be on and2031 * it really improves the throughput though not the echo time2032 * on my slow slip link - Alan2033 */2034
2035 /*2036 * Avoid possible race on send_tmp - c/o Johannes Stille 2037 */2038
2039 if(sk->partial && ((!sk->packets_out)
2040 /* If not nagling we can send on the before case too.. */2041 || (sk->nonagle && before(sk->write_seq , sk->window_seq))
2042 ))
2043 tcp_send_partial(sk);
2044
2045 release_sock(sk);
2046 return(copied);
2047 }2048
2049 /*2050 * Send an ack if one is backlogged at this point. Ought to merge2051 * this with tcp_send_ack().2052 */2053
/*
 * Send an ack if one is backlogged at this point. Ought to merge
 * this with tcp_send_ack().  Builds a bare ACK segment from the
 * socket's template header; if memory is short it just re-arms the
 * timer to retry shortly.
 */
static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	if (!sk->ack_backlog)
		return;		/* nothing owed */

	/*
	 * If we're closed, don't send an ack, or we'll get a RST
	 * from the closed destination.
	 */
	if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
		return;

	/*
	 * FIXME: we need to put code here to prevent this routine from
	 * being called. Being called once in a while is ok, so only check
	 * if this is the second time in a row.
	 */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 * Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		/* Couldn't route it: drop the buffer */
		buff->free = 1;
		sock_wfree(sk, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	/* Start from the socket template and mark it a pure ack */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2128
2129 /*2130 * FIXME:2131 * This routine frees used buffers.2132 * It should consider sending an ACK to let the2133 * other end know we now have a bigger window.2134 */2135
/*
 * Free fully-consumed buffers on the receive queue and, if that
 * changed the available receive space, arrange for the window update
 * to reach the peer (immediately or via a short ack timer).
 */
static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	left = sock_rspace(sk);	/* space before we free anything */

	/*
	 * We have to loop through all the buffer headers,
	 * and try to free up all the space we can.
	 */

	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		/* Stop at the first buffer still unread or in use */
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 * FIXME:
	 * At this point we should send an ack if the difference
	 * in the window, and the amount of space is bigger than
	 * TCP_WINDOW_DIFF.
	 */

	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sock_rspace(sk),
			left);
	if ((rspace=sock_rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble. The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window. If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately. Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
	/*
	 * It's unclear whether to use sk->mtu or sk->mss here. They differ only
	 * if the other end is offering a window smaller than the agreed on MSS
	 * (called sk->mtu here). In theory there's no connection between send
	 * and receive, and so no reason to think that they're going to send
	 * small packets. For the moment I'm using the hack of reducing the mss
	 * only on the send side, so I'm putting mtu here.
	 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			/* NOTE(review): this compares against sk->timer.expires
			 * although the timer deleted above is
			 * sk->retransmit_timer - confirm which timer is meant. */
			if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);
		}
	}
}
2218
2219 /*2220 * Handle reading urgent data. BSD has very simple semantics for2221 * this, no blocking and very strange errors 8)2222 */2223
/*
 * Handle reading urgent data. BSD has very simple semantics for
 * this, no blocking and very strange errors 8)
 *
 * Returns 1 with the single urgent byte copied out, 0 at end of
 * connection, -EINVAL when no out-of-line urgent byte exists,
 * -EAGAIN when the byte is expected but not yet here.
 */
static int tcp_recv_urg(struct sock * sk, int nonblock,
	 struct msghdr *msg, int len, int flags, int *addr_len)
{
	/*
	 * No URG data to read
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
		return sock_error(sk);

	if (sk->state == TCP_CLOSE || sk->done)
	{
		if (!sk->done)
		{
			/* First EOF report succeeds with 0 */
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID)
	{
		/* Low byte of urg_data holds the urgent byte itself */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;	/* consume it */
		memcpy_toiovec(msg->msg_iov, &c, 1);
		if(msg->msg_name)
		{
			/* Fill in the peer's address for recvfrom() callers */
			struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
			sin->sin_family=AF_INET;
			sin->sin_addr.s_addr=sk->daddr;
			sin->sin_port=sk->dummy_th.dest;
		}
		if(addr_len)
			*addr_len=sizeof(struct sockaddr_in);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
2282
2283 /*2284 * This routine copies from a sock struct into the user buffer. 2285 */2286
/*
 *	Copy received TCP data from the socket's receive queue into the
 *	user's iovec.  Handles peeking (MSG_PEEK), urgent-data boundaries,
 *	blocking/non-blocking operation, and FIN processing.
 *
 *	Returns the number of bytes copied, 0 at EOF, or a negative errno.
 */
static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags, int *addr_len)
{
	struct wait_queue wait = {current, NULL};
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */

	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */

	if (flags & MSG_OOB)
		return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 *	A PEEK reads through a private copy so the real
	 *	copied_seq is left untouched.
	 */

	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;		/* lock out the bottom half */
	while (len > 0)
	{
		struct sk_buff * skb;
		u32 offset;

		/*
		 *	Are we at urgent data? Stop if we have read anything.
		 *	(Urgent data marks a read boundary.)
		 */

		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.  Set the task state before scanning
		 *	so a wakeup between the scan and schedule() is not lost.
		 */

		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* A hole in the sequence space: nothing usable yet. */
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies one sequence number */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			/* Fully consumed buffer: mark it for cleanup_rbuf(). */
			if (!(flags & MSG_PEEK))
				skb->used = 1;
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* Return whatever we already copied rather than blocking. */
		if (copied)
			break;

		if (sk->err)
		{
			copied = sock_error(sk);
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;	/* first EOF report */
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Ack consumed data, drop the lock and wait for more. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */

		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */

		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?  If so, stop the copy at
		 *	the urgent byte and (unless urginline) skip over it.
		 */

		if (sk->urg_data)
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					if (!sk->urginline)
					{
						++*seq;	/* step past the urgent byte */
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;	/* copy only up to it */
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */

		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */

		memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */

		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;	/* urgent byte is behind us now */
		if (used + offset < skb->len)
			continue;	/* more data left in this buffer */

		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;	/* buffer drained, reclaimable */
		continue;

	found_fin_ok:
		++*seq;		/* FIN occupies one sequence number */
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */

		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}

	/* Fill in the peer's address for recvfrom() style callers. */
	if(copied>0 && msg->msg_name)
	{
		struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
		sin->sin_family=AF_INET;
		sin->sin_addr.s_addr=sk->daddr;
		sin->sin_port=sk->dummy_th.dest;
	}
	if(addr_len)
		*addr_len=sizeof(struct sockaddr_in);

	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2523
2524
2525 /*2526 * State processing on a close. This implements the state shift for2527 * sending our FIN frame. Note that we only send a FIN for some 2528 * states. A shutdown() may have already sent the FIN, or we may be2529 * closed.2530 */2531
/*
 *	Perform the TCP state transition for a local close, and decide
 *	whether a FIN must be transmitted.  'dead' is non-zero when the
 *	socket is being fully closed (not just shutdown).
 *
 *	Returns 1 if the caller should send a FIN, 0 otherwise.
 */
static int tcp_close_state(struct sock *sk, int dead)
{
	int ns=TCP_CLOSE;
	int send_fin=0;
	switch(sk->state)
	{
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:	/* Closedown begin */
			ns=TCP_FIN_WAIT1;
			send_fin=1;
			break;
		case TCP_FIN_WAIT1:	/* Already closing, or FIN sent: no change */
		case TCP_FIN_WAIT2:
		case TCP_CLOSING:
			ns=sk->state;
			break;
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;
	}

	tcp_set_state(sk,ns);

	/*
	 *	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 *
	 *	Only arm the FIN_WAIT2 timeout if no other timer is already
	 *	pending (del_timer tells us, and we re-add it if it was).
	 */
	if(dead && ns==TCP_FIN_WAIT2)
	{
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
	}

	return send_fin;
}
2582 /*2583 * Send a fin.2584 */2585
2586 staticvoidtcp_send_fin(structsock *sk)
/* */2587 {2588 structproto *prot =(structproto *)sk->prot;
2589 structtcphdr *th =(structtcphdr *)&sk->dummy_th;
2590 structtcphdr *t1;
2591 structsk_buff *buff;
2592 structdevice *dev=NULL;
2593 inttmp;
2594
2595 release_sock(sk); /* in case the malloc sleeps. */2596
2597 buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2598 sk->inuse = 1;
2599
2600 if (buff == NULL)
2601 {2602 /* This is a disaster if it occurs */2603 printk("tcp_send_fin: Impossible malloc failure");
2604 return;
2605 }2606
2607 /*2608 * Administrivia2609 */2610
2611 buff->sk = sk;
2612 buff->localroute = sk->localroute;
2613
2614 /*2615 * Put in the IP header and routing stuff. 2616 */2617
2618 tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2619 IPPROTO_TCP, sk->opt,
2620 sizeof(structtcphdr),sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2621 if (tmp < 0)
2622 {2623 intt;
2624 /*2625 * Finish anyway, treat this as a send that got lost. 2626 * (Not good).2627 */2628
2629 buff->free = 1;
2630 sock_wfree(sk,buff);
2631 sk->write_seq++;
2632 t=del_timer(&sk->timer);
2633 if(t)
2634 add_timer(&sk->timer);
2635 else2636 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2637 return;
2638 }2639
2640 /*2641 * We ought to check if the end of the queue is a buffer and2642 * if so simply add the fin to that buffer, not send it ahead.2643 */2644
2645 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
2646 buff->dev = dev;
2647 memcpy(t1, th, sizeof(*t1));
2648 t1->seq = ntohl(sk->write_seq);
2649 sk->write_seq++;
2650 buff->h.seq = sk->write_seq;
2651 t1->ack = 1;
2652 t1->ack_seq = ntohl(sk->acked_seq);
2653 t1->window = ntohs(sk->window=tcp_select_window(sk));
2654 t1->fin = 1;
2655 t1->rst = 0;
2656 t1->doff = sizeof(*t1)/4;
2657 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2658
2659 /*2660 * If there is data in the write queue, the fin must be appended to2661 * the write queue.2662 */2663
2664 if (skb_peek(&sk->write_queue) != NULL)
2665 {2666 buff->free = 0;
2667 if (buff->next != NULL)
2668 {2669 printk("tcp_send_fin: next != NULL\n");
2670 skb_unlink(buff);
2671 }2672 skb_queue_tail(&sk->write_queue, buff);
2673 }2674 else2675 {2676 sk->sent_seq = sk->write_seq;
2677 sk->prot->queue_xmit(sk, dev, buff, 0);
2678 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2679 }2680 }2681
2682 /*2683 * Shutdown the sending side of a connection. Much like close except2684 * that we don't receive shut down or set sk->dead=1.2685 */2686
2687 voidtcp_shutdown(structsock *sk, inthow)
/* */2688 {2689 /*2690 * We need to grab some memory, and put together a FIN,2691 * and then put it into the queue to be sent.2692 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2693 */2694
2695 if (!(how & SEND_SHUTDOWN))
2696 return;
2697
2698 /*2699 * If we've already sent a FIN, or it's a closed state2700 */2701
2702 if (sk->state == TCP_FIN_WAIT1 ||
2703 sk->state == TCP_FIN_WAIT2 ||
2704 sk->state == TCP_CLOSING ||
2705 sk->state == TCP_LAST_ACK ||
2706 sk->state == TCP_TIME_WAIT ||
2707 sk->state == TCP_CLOSE ||
2708 sk->state == TCP_LISTEN2709 )
2710 {2711 return;
2712 }2713 sk->inuse = 1;
2714
2715 /*2716 * flag that the sender has shutdown2717 */2718
2719 sk->shutdown |= SEND_SHUTDOWN;
2720
2721 /*2722 * Clear out any half completed packets. 2723 */2724
2725 if (sk->partial)
2726 tcp_send_partial(sk);
2727
2728 /*2729 * FIN if needed2730 */2731
2732 if(tcp_close_state(sk,0))
2733 tcp_send_fin(sk);
2734
2735 release_sock(sk);
2736 }2737
2738 /*2739 * This routine will send an RST to the other tcp. 2740 */2741
/*
 *	Build and transmit an RST in response to the given segment.
 *	Used for segments arriving at closed or unsynchronised sockets.
 *	The reply's sequence/ack fields follow RFC 793: echo the ack_seq
 *	as our seq if the offending segment had ACK set, otherwise ack
 *	the segment's own sequence (plus one for a SYN).
 */
static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	int tmp;
	struct device *ndev=NULL;

	/*
	 *	Cannot reset a reset (Think about it).
	 */

	if(th->rst)
		return;

	/*
	 *	We need to grab some memory, and put together an RST,
	 *	and then put it into the queue to be sent.
	 *	(No owning socket: charge it to nobody, atomic allocation
	 *	since we may be in interrupt context.)
	 */

	buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	  	return;

	buff->sk = NULL;
	buff->dev = dev;
	buff->localroute = 0;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr),tos,ttl,NULL);
	if (tmp < 0)
	{
  		buff->free = 1;
		sock_wfree(NULL, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	memcpy(t1, th, sizeof(*t1));	/* start from the offending header */

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->rst = 1;
	t1->window = 0;

	if(th->ack)
	{
		/* They acked something: our seq is what they acked. */
	  	t1->ack = 0;
	  	t1->seq = th->ack_seq;
	  	t1->ack_seq = 0;
	}
	else
	{
		/* No ACK: we must ack their segment (SYN counts as one octet). */
	  	t1->ack = 1;
	  	if(!th->syn)
			t1->ack_seq=htonl(th->seq);
		else
			t1->ack_seq=htonl(th->seq+1);
		t1->seq=0;
	}

	t1->syn = 0;
	t1->urg = 0;
	t1->fin = 0;
	t1->psh = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
	prot->queue_xmit(NULL, ndev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2821
2822 /*2823 * Look for tcp options. Parses everything but only knows about MSS.2824 * This routine is always called with the packet containing the SYN.2825 * However it may also be called with the ack to the SYN. So you2826 * can't assume this is always the SYN. It's always called after2827 * we have set up sk->mtu to our own MTU.2828 *2829 * We need at minimum to add PAWS support here. Possibly large windows2830 * as Linux gets deployed on 100Mb/sec networks.2831 */2832
2833 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2834 {2835 unsignedchar *ptr;
2836 intlength=(th->doff*4)-sizeof(structtcphdr);
2837 intmss_seen = 0;
2838
2839 ptr = (unsignedchar *)(th + 1);
2840
2841 while(length>0)
2842 {2843 intopcode=*ptr++;
2844 intopsize=*ptr++;
2845 switch(opcode)
2846 {2847 caseTCPOPT_EOL:
2848 return;
2849 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */2850 length--;
2851 ptr--; /* the opsize=*ptr++ above was a mistake */2852 continue;
2853
2854 default:
2855 if(opsize<=2) /* Avoid silly options looping forever */2856 return;
2857 switch(opcode)
2858 {2859 caseTCPOPT_MSS:
2860 if(opsize==4 && th->syn)
2861 {2862 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2863 mss_seen = 1;
2864 }2865 break;
2866 /* Add other options here as people feel the urge to implement stuff like large windows */2867 }2868 ptr+=opsize-2;
2869 length-=opsize;
2870 }2871 }2872 if (th->syn)
2873 {2874 if (! mss_seen)
2875 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2876 }2877 #ifdefCONFIG_INET_PCTCP2878 sk->mss = min(sk->max_window >> 1, sk->mtu);
2879 #else2880 sk->mss = min(sk->max_window, sk->mtu);
2881 #endif2882 }2883
/*
 *	Return the classful (A/B/C) network mask for a destination
 *	address.  Both the argument and the result are in network
 *	byte order.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);	/* classify in host order */
	unsigned long mask;

	if (IN_CLASSA(host))
		mask = IN_CLASSA_NET;
	else if (IN_CLASSB(host))
		mask = IN_CLASSB_NET;
	else
		mask = IN_CLASSC_NET;

	return htonl(mask);
}
2894 /*2895 * Default sequence number picking algorithm.2896 * As close as possible to RFC 793, which2897 * suggests using a 250kHz clock.2898 * Further reading shows this assumes 2MB/s networks.2899 * For 10MB/s ethernet, a 1MHz clock is appropriate.2900 * That's funny, Linux has one built in! Use it!2901 */2902
2903 externinlineu32tcp_init_seq(void)
/* */2904 {2905 structtimevaltv;
2906 do_gettimeofday(&tv);
2907 returntv.tv_usec+tv.tv_sec*1000000;
2908 }2909
2910 /*2911 * This routine handles a connection request.2912 * It should make sure we haven't already responded.2913 * Because of the way BSD works, we have to send a syn/ack now.2914 * This also means it will be harder to close a socket which is2915 * listening.2916 */2917
2918 staticvoidtcp_conn_request(structsock *sk, structsk_buff *skb,
/* */2919 unsignedlongdaddr, unsignedlongsaddr,
2920 structoptions *opt, structdevice *dev, u32seq)
2921 {2922 structsk_buff *buff;
2923 structtcphdr *t1;
2924 unsignedchar *ptr;
2925 structsock *newsk;
2926 structtcphdr *th;
2927 structdevice *ndev=NULL;
2928 inttmp;
2929 structrtable *rt;
2930
2931 th = skb->h.th;
2932
2933 /* If the socket is dead, don't accept the connection. */2934 if (!sk->dead)
2935 {2936 sk->data_ready(sk,0);
2937 }2938 else2939 {2940 if(sk->debug)
2941 printk("Reset on %p: Connect on dead socket.\n",sk);
2942 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2943 tcp_statistics.TcpAttemptFails++;
2944 kfree_skb(skb, FREE_READ);
2945 return;
2946 }2947
2948 /*2949 * Make sure we can accept more. This will prevent a2950 * flurry of syns from eating up all our memory.2951 */2952
2953 if (sk->ack_backlog >= sk->max_ack_backlog)
2954 {2955 tcp_statistics.TcpAttemptFails++;
2956 kfree_skb(skb, FREE_READ);
2957 return;
2958 }2959
2960 /*2961 * We need to build a new sock struct.2962 * It is sort of bad to have a socket without an inode attached2963 * to it, but the wake_up's will just wake up the listening socket,2964 * and if the listening socket is destroyed before this is taken2965 * off of the queue, this will take care of it.2966 */2967
2968 newsk = (structsock *) kmalloc(sizeof(structsock), GFP_ATOMIC);
2969 if (newsk == NULL)
2970 {2971 /* just ignore the syn. It will get retransmitted. */2972 tcp_statistics.TcpAttemptFails++;
2973 kfree_skb(skb, FREE_READ);
2974 return;
2975 }2976
2977 memcpy(newsk, sk, sizeof(*newsk));
2978 newsk->opt = NULL;
2979 newsk->ip_route_cache = NULL;
2980 if (opt && opt->optlen) {2981 sk->opt = (structoptions*)kmalloc(sizeof(structoptions)+opt->optlen, GFP_ATOMIC);
2982 if (!sk->opt) {2983 kfree_s(newsk, sizeof(structsock));
2984 tcp_statistics.TcpAttemptFails++;
2985 kfree_skb(skb, FREE_READ);
2986 return;
2987 }2988 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {2989 kfree_s(sk->opt, sizeof(structoptions)+opt->optlen);
2990 kfree_s(newsk, sizeof(structsock));
2991 tcp_statistics.TcpAttemptFails++;
2992 kfree_skb(skb, FREE_READ);
2993 return;
2994 }2995 }2996 skb_queue_head_init(&newsk->write_queue);
2997 skb_queue_head_init(&newsk->receive_queue);
2998 newsk->send_head = NULL;
2999 newsk->send_tail = NULL;
3000 skb_queue_head_init(&newsk->back_log);
3001 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/3002 newsk->rto = TCP_TIMEOUT_INIT;
3003 newsk->mdev = 0;
3004 newsk->max_window = 0;
3005 newsk->cong_window = 1;
3006 newsk->cong_count = 0;
3007 newsk->ssthresh = 0;
3008 newsk->backoff = 0;
3009 newsk->blog = 0;
3010 newsk->intr = 0;
3011 newsk->proc = 0;
3012 newsk->done = 0;
3013 newsk->partial = NULL;
3014 newsk->pair = NULL;
3015 newsk->wmem_alloc = 0;
3016 newsk->rmem_alloc = 0;
3017 newsk->localroute = sk->localroute;
3018
3019 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
3020
3021 newsk->err = 0;
3022 newsk->shutdown = 0;
3023 newsk->ack_backlog = 0;
3024 newsk->acked_seq = skb->h.th->seq+1;
3025 newsk->copied_seq = skb->h.th->seq+1;
3026 newsk->fin_seq = skb->h.th->seq;
3027 newsk->state = TCP_SYN_RECV;
3028 newsk->timeout = 0;
3029 newsk->ip_xmit_timeout = 0;
3030 newsk->write_seq = seq;
3031 newsk->window_seq = newsk->write_seq;
3032 newsk->rcv_ack_seq = newsk->write_seq;
3033 newsk->urg_data = 0;
3034 newsk->retransmits = 0;
3035 newsk->linger=0;
3036 newsk->destroy = 0;
3037 init_timer(&newsk->timer);
3038 newsk->timer.data = (unsignedlong)newsk;
3039 newsk->timer.function = &net_timer;
3040 init_timer(&newsk->retransmit_timer);
3041 newsk->retransmit_timer.data = (unsignedlong)newsk;
3042 newsk->retransmit_timer.function=&retransmit_timer;
3043 newsk->dummy_th.source = skb->h.th->dest;
3044 newsk->dummy_th.dest = skb->h.th->source;
3045
3046 /*3047 * Swap these two, they are from our point of view. 3048 */3049
3050 newsk->daddr = saddr;
3051 newsk->saddr = daddr;
3052 newsk->rcv_saddr = daddr;
3053
3054 put_sock(newsk->num,newsk);
3055 newsk->dummy_th.res1 = 0;
3056 newsk->dummy_th.doff = 6;
3057 newsk->dummy_th.fin = 0;
3058 newsk->dummy_th.syn = 0;
3059 newsk->dummy_th.rst = 0;
3060 newsk->dummy_th.psh = 0;
3061 newsk->dummy_th.ack = 0;
3062 newsk->dummy_th.urg = 0;
3063 newsk->dummy_th.res2 = 0;
3064 newsk->acked_seq = skb->h.th->seq + 1;
3065 newsk->copied_seq = skb->h.th->seq + 1;
3066 newsk->socket = NULL;
3067
3068 /*3069 * Grab the ttl and tos values and use them 3070 */3071
3072 newsk->ip_ttl=sk->ip_ttl;
3073 newsk->ip_tos=skb->ip_hdr->tos;
3074
3075 /*3076 * Use 512 or whatever user asked for 3077 */3078
3079 /*3080 * Note use of sk->user_mss, since user has no direct access to newsk 3081 */3082
3083 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
3084 newsk->ip_route_cache = rt;
3085
3086 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3087 newsk->window_clamp = rt->rt_window;
3088 else3089 newsk->window_clamp = 0;
3090
3091 if (sk->user_mss)
3092 newsk->mtu = sk->user_mss;
3093 elseif (rt)
3094 newsk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
3095 else3096 newsk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
3097
3098 /*3099 * But not bigger than device MTU 3100 */3101
3102 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
3103
3104 #ifdefCONFIG_SKIP3105
3106 /*3107 * SKIP devices set their MTU to 65535. This is so they can take packets3108 * unfragmented to security process then fragment. They could lie to the3109 * TCP layer about a suitable MTU, but its easier to let skip sort it out3110 * simply because the final package we want unfragmented is going to be3111 *3112 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]3113 */3114
3115 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */3116 sk->mtu=skip_pick_mtu(sk->mtu,dev);
3117 #endif3118 /*3119 * This will min with what arrived in the packet 3120 */3121
3122 tcp_options(newsk,skb->h.th);
3123
3124 tcp_cache_zap();
3125
3126 buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3127 if (buff == NULL)
3128 {3129 sk->err = ENOMEM;
3130 newsk->dead = 1;
3131 newsk->state = TCP_CLOSE;
3132 /* And this will destroy it */3133 release_sock(newsk);
3134 kfree_skb(skb, FREE_READ);
3135 tcp_statistics.TcpAttemptFails++;
3136 return;
3137 }3138
3139 buff->sk = newsk;
3140 buff->localroute = newsk->localroute;
3141
3142 /*3143 * Put in the IP header and routing stuff. 3144 */3145
3146 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3147 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&newsk->ip_route_cache);
3148
3149 /*3150 * Something went wrong. 3151 */3152
3153 if (tmp < 0)
3154 {3155 sk->err = tmp;
3156 buff->free = 1;
3157 kfree_skb(buff,FREE_WRITE);
3158 newsk->dead = 1;
3159 newsk->state = TCP_CLOSE;
3160 release_sock(newsk);
3161 skb->sk = sk;
3162 kfree_skb(skb, FREE_READ);
3163 tcp_statistics.TcpAttemptFails++;
3164 return;
3165 }3166
3167 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
3168
3169 memcpy(t1, skb->h.th, sizeof(*t1));
3170 buff->h.seq = newsk->write_seq;
3171 /*3172 * Swap the send and the receive. 3173 */3174 t1->dest = skb->h.th->source;
3175 t1->source = newsk->dummy_th.source;
3176 t1->seq = ntohl(newsk->write_seq++);
3177 t1->ack = 1;
3178 newsk->window = tcp_select_window(newsk);
3179 newsk->sent_seq = newsk->write_seq;
3180 t1->window = ntohs(newsk->window);
3181 t1->res1 = 0;
3182 t1->res2 = 0;
3183 t1->rst = 0;
3184 t1->urg = 0;
3185 t1->psh = 0;
3186 t1->syn = 1;
3187 t1->ack_seq = ntohl(skb->h.th->seq+1);
3188 t1->doff = sizeof(*t1)/4+1;
3189 ptr = skb_put(buff,4);
3190 ptr[0] = 2;
3191 ptr[1] = 4;
3192 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3193 ptr[3] =(newsk->mtu) & 0xff;
3194
3195 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3196 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3197 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3198 skb->sk = newsk;
3199
3200 /*3201 * Charge the sock_buff to newsk. 3202 */3203
3204 sk->rmem_alloc -= skb->truesize;
3205 newsk->rmem_alloc += skb->truesize;
3206
3207 skb_queue_tail(&sk->receive_queue,skb);
3208 sk->ack_backlog++;
3209 release_sock(newsk);
3210 tcp_statistics.TcpOutSegs++;
3211 }3212
3213
/*
 *	Close a TCP socket.  'timeout' non-zero means the close timed out
 *	(or linger expired) and the socket is killed immediately instead
 *	of going through the orderly FIN handshake.
 */
static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 * We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 */

	sk->inuse = 1;		/* lock out the bottom half */

	if(th_cache_sk==sk)
		tcp_cache_zap();	/* drop stale header-prediction cache */
	if(sk->state == TCP_LISTEN)
	{
		/* Special case: a listener just discards its pending SYNs. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;

	if (!sk->dead)
	  	sk->state_change(sk);

	if (timeout == 0)
	{
		struct sk_buff *skb;

		/*
		 *  We need to flush the recv. buffs.  We do this only on the
		 *  descriptor close, not protocol-sourced closes, because the
		 *  reader process may not have drained the data yet!
		 */

		while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 *	Get rid off any half-completed packets.
		 */

		if (sk->partial)
			tcp_send_partial(sk);
	}

	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */

	if(timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* Orderly close: shift state and send our FIN if required. */
		if(tcp_close_state(sk,1)==1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
3280
3281 /*3282 * This routine takes stuff off of the write queue,3283 * and puts it in the xmit queue. This happens as incoming acks3284 * open up the remote window for us.3285 */3286
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue. This happens as incoming acks
 *	open up the remote window for us.
 */
static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->h.seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it. This
			 *	cannot currently occur.
			 */

			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
			/*
			 * put in the ack seq and window at this point rather than earlier,
			 * in order to keep them monotonic.  We really want to avoid taking
			 * back window allocations.  That's legal, but RFC1122 says it's frowned on.
			 * Ack and window will in general have changed since this packet was put
			 * on the write queue.
			 */
			iph = skb->ip_hdr;
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			size = skb->len - (((unsigned char *) th) - skb->data);
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
			/* Oversized for the path: clear DF and let IP fragment. */
			if (size > sk->mtu - sizeof(struct iphdr))
			{
				iph->frag_off &= ~htons(IP_DF);
				ip_send_check(iph);	/* recompute IP checksum */
			}
#endif

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */

			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */

			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3378
3379 /*3380 * This routine deals with incoming acks, but not outgoing ones.3381 */3382
3383 extern__inline__inttcp_ack(structsock *sk, structtcphdr *th, unsignedlongsaddr, intlen)
/* */3384 {3385 u32ack;
3386 intflag = 0;
3387
3388 /* 3389 * 1 - there was data in packet as well as ack or new data is sent or 3390 * in shutdown state3391 * 2 - data from retransmit queue was acked and removed3392 * 4 - window shrunk or data from retransmit queue was acked and removed3393 */3394
3395 if(sk->zapped)
3396 return(1); /* Dead, cant ack any more so why bother */3397
3398 /*3399 * Have we discovered a larger window3400 */3401
3402 ack = ntohl(th->ack_seq);
3403
3404 if (ntohs(th->window) > sk->max_window)
3405 {3406 sk->max_window = ntohs(th->window);
3407 #ifdefCONFIG_INET_PCTCP3408 /* Hack because we don't send partial packets to non SWS3409 handling hosts */3410 sk->mss = min(sk->max_window>>1, sk->mtu);
3411 #else3412 sk->mss = min(sk->max_window, sk->mtu);
3413 #endif3414 }3415
3416 /*3417 * We have dropped back to keepalive timeouts. Thus we have3418 * no retransmits pending.3419 */3420
3421 if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3422 sk->retransmits = 0;
3423
3424 /*3425 * If the ack is newer than sent or older than previous acks3426 * then we can probably ignore it.3427 */3428
3429 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
3430 {3431 if(sk->debug)
3432 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3433
3434 /*3435 * Keepalive processing.3436 */3437
3438 if (after(ack, sk->sent_seq))
3439 {3440 return(0);
3441 }3442
3443 /*3444 * Restart the keepalive timer.3445 */3446
3447 if (sk->keepopen)
3448 {3449 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3450 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3451 }3452 return(1);
3453 }3454
3455 /*3456 * If there is data set flag 13457 */3458
3459 if (len != th->doff*4)
3460 flag |= 1;
3461
3462 /*3463 * See if our window has been shrunk. 3464 */3465
3466 if (after(sk->window_seq, ack+ntohs(th->window)))
3467 {3468 /*3469 * We may need to move packets from the send queue3470 * to the write queue, if the window has been shrunk on us.3471 * The RFC says you are not allowed to shrink your window3472 * like this, but if the other end does, you must be able3473 * to deal with it.3474 */3475 structsk_buff *skb;
3476 structsk_buff *skb2;
3477 structsk_buff *wskb = NULL;
3478
3479 skb2 = sk->send_head;
3480 sk->send_head = NULL;
3481 sk->send_tail = NULL;
3482
3483 /*3484 * This is an artifact of a flawed concept. We want one3485 * queue and a smarter send routine when we send all.3486 */3487
3488 flag |= 4; /* Window changed */3489
3490 sk->window_seq = ack + ntohs(th->window);
3491 cli();
3492 while (skb2 != NULL)
3493 {3494 skb = skb2;
3495 skb2 = skb->link3;
3496 skb->link3 = NULL;
3497 if (after(skb->h.seq, sk->window_seq))
3498 {3499 if (sk->packets_out > 0)
3500 sk->packets_out--;
3501 /* We may need to remove this from the dev send list. */3502 if (skb->next != NULL)
3503 {3504 skb_unlink(skb);
3505 }3506 /* Now add it to the write_queue. */3507 if (wskb == NULL)
3508 skb_queue_head(&sk->write_queue,skb);
3509 else3510 skb_append(wskb,skb);
3511 wskb = skb;
3512 }3513 else3514 {3515 if (sk->send_head == NULL)
3516 {3517 sk->send_head = skb;
3518 sk->send_tail = skb;
3519 }3520 else3521 {3522 sk->send_tail->link3 = skb;
3523 sk->send_tail = skb;
3524 }3525 skb->link3 = NULL;
3526 }3527 }3528 sti();
3529 }3530
3531 /*3532 * Pipe has emptied3533 */3534
3535 if (sk->send_tail == NULL || sk->send_head == NULL)
3536 {3537 sk->send_head = NULL;
3538 sk->send_tail = NULL;
3539 sk->packets_out= 0;
3540 }3541
3542 /*3543 * Update the right hand window edge of the host3544 */3545
3546 sk->window_seq = ack + ntohs(th->window);
3547
3548 /*3549 * We don't want too many packets out there. 3550 */3551
3552 if (sk->ip_xmit_timeout == TIME_WRITE &&
3553 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
3554 {3555 /* 3556 * This is Jacobson's slow start and congestion avoidance. 3557 * SIGCOMM '88, p. 328. Because we keep cong_window in integral3558 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a 3559 * counter and increment it once every cwnd times. It's possible3560 * that this should be done only if sk->retransmits == 0. I'm3561 * interpreting "new data is acked" as including data that has3562 * been retransmitted but is just now being acked.3563 */3564 if (sk->cong_window < sk->ssthresh)
3565 /* 3566 * In "safe" area, increase3567 */3568 sk->cong_window++;
3569 else3570 {3571 /*3572 * In dangerous area, increase slowly. In theory this is3573 * sk->cong_window += 1 / sk->cong_window3574 */3575 if (sk->cong_count >= sk->cong_window)
3576 {3577 sk->cong_window++;
3578 sk->cong_count = 0;
3579 }3580 else3581 sk->cong_count++;
3582 }3583 }3584
3585 /*3586 * Remember the highest ack received.3587 */3588
3589 sk->rcv_ack_seq = ack;
3590
3591 /*3592 * If this ack opens up a zero window, clear backoff. It was3593 * being used to time the probes, and is probably far higher than3594 * it needs to be for normal retransmission.3595 */3596
3597 if (sk->ip_xmit_timeout == TIME_PROBE0)
3598 {3599 sk->retransmits = 0; /* Our probe was answered */3600
3601 /*3602 * Was it a usable window open ?3603 */3604
3605 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */3606 ! before (sk->window_seq, sk->write_queue.next->h.seq))
3607 {3608 sk->backoff = 0;
3609
3610 /*3611 * Recompute rto from rtt. this eliminates any backoff.3612 */3613
3614 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3615 if (sk->rto > 120*HZ)
3616 sk->rto = 120*HZ;
3617 if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about3618 .2 of a second because of BSD delayed acks - on a 100Mb/sec link3619 .2 of a second is going to need huge windows (SIGH) */3620 sk->rto = 20;
3621 }3622 }3623
3624 /* 3625 * See if we can take anything off of the retransmit queue.3626 */3627
3628 while(sk->send_head != NULL)
3629 {3630 /* Check for a bug. */3631 if (sk->send_head->link3 &&
3632 after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
3633 printk("INET: tcp.c: *** bug send_list out of order.\n");
3634
3635 /*3636 * If our packet is before the ack sequence we can3637 * discard it as it's confirmed to have arrived the other end.3638 */3639
3640 if (before(sk->send_head->h.seq, ack+1))
3641 {3642 structsk_buff *oskb;
3643 if (sk->retransmits)
3644 {3645 /*3646 * We were retransmitting. don't count this in RTT est 3647 */3648 flag |= 2;
3649
3650 /*3651 * even though we've gotten an ack, we're still3652 * retransmitting as long as we're sending from3653 * the retransmit queue. Keeping retransmits non-zero3654 * prevents us from getting new data interspersed with3655 * retransmissions.3656 */3657
3658 if (sk->send_head->link3) /* Any more queued retransmits? */3659 sk->retransmits = 1;
3660 else3661 sk->retransmits = 0;
3662 }3663 /*3664 * Note that we only reset backoff and rto in the3665 * rtt recomputation code. And that doesn't happen3666 * if there were retransmissions in effect. So the3667 * first new packet after the retransmissions is3668 * sent with the backoff still in effect. Not until3669 * we get an ack from a non-retransmitted packet do3670 * we reset the backoff and rto. This allows us to deal3671 * with a situation where the network delay has increased3672 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)3673 */3674
3675 /*3676 * We have one less packet out there. 3677 */3678
3679 if (sk->packets_out > 0)
3680 sk->packets_out --;
3681 /* 3682 * Wake up the process, it can probably write more. 3683 */3684 if (!sk->dead)
3685 sk->write_space(sk);
3686 oskb = sk->send_head;
3687
3688 if (!(flag&2)) /* Not retransmitting */3689 {3690 longm;
3691
3692 /*3693 * The following amusing code comes from Jacobson's3694 * article in SIGCOMM '88. Note that rtt and mdev3695 * are scaled versions of rtt and mean deviation.3696 * This is designed to be as fast as possible 3697 * m stands for "measurement".3698 */3699
3700 m = jiffies - oskb->when; /* RTT */3701 if(m<=0)
3702 m=1; /* IS THIS RIGHT FOR <0 ??? */3703 m -= (sk->rtt >> 3); /* m is now error in rtt est */3704 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */3705 if (m < 0)
3706 m = -m; /* m is now abs(error) */3707 m -= (sk->mdev >> 2); /* similar update on mdev */3708 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */3709
3710 /*3711 * Now update timeout. Note that this removes any backoff.3712 */3713
3714 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3715 if (sk->rto > 120*HZ)
3716 sk->rto = 120*HZ;
3717 if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */3718 sk->rto = 20;
3719 sk->backoff = 0;
3720 }3721 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt 3722 In this case as we just set it up */3723 cli();
3724 oskb = sk->send_head;
3725 IS_SKB(oskb);
3726 sk->send_head = oskb->link3;
3727 if (sk->send_head == NULL)
3728 {3729 sk->send_tail = NULL;
3730 }3731
3732 /*3733 * We may need to remove this from the dev send list. 3734 */3735
3736 if (oskb->next)
3737 skb_unlink(oskb);
3738 sti();
3739 kfree_skb(oskb, FREE_WRITE); /* write. */3740 if (!sk->dead)
3741 sk->write_space(sk);
3742 }3743 else3744 {3745 break;
3746 }3747 }3748
3749 /*3750 * XXX someone ought to look at this too.. at the moment, if skb_peek()3751 * returns non-NULL, we complete ignore the timer stuff in the else3752 * clause. We ought to organize the code so that else clause can3753 * (should) be executed regardless, possibly moving the PROBE timer3754 * reset over. The skb_peek() thing should only move stuff to the3755 * write queue, NOT also manage the timer functions.3756 */3757
3758 /*3759 * Maybe we can take some stuff off of the write queue,3760 * and put it onto the xmit queue.3761 */3762 if (skb_peek(&sk->write_queue) != NULL)
3763 {3764 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3765 (sk->retransmits == 0 ||
3766 sk->ip_xmit_timeout != TIME_WRITE ||
3767 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3768 && sk->packets_out < sk->cong_window)
3769 {3770 /*3771 * Add more data to the send queue.3772 */3773 flag |= 1;
3774 tcp_write_xmit(sk);
3775 }3776 elseif (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3777 sk->send_head == NULL &&
3778 sk->ack_backlog == 0 &&
3779 sk->state != TCP_TIME_WAIT)
3780 {3781 /*3782 * Data to queue but no room.3783 */3784 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3785 }3786 }3787 else3788 {3789 /*3790 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets3791 * from TCP_CLOSE we don't do anything3792 *3793 * from anything else, if there is write data (or fin) pending,3794 * we use a TIME_WRITE timeout, else if keepalive we reset to3795 * a KEEPALIVE timeout, else we delete the timer.3796 *3797 * We do not set flag for nominal write data, otherwise we may3798 * force a state where we start to write itsy bitsy tidbits3799 * of data.3800 */3801
3802 switch(sk->state) {3803 caseTCP_TIME_WAIT:
3804 /*3805 * keep us in TIME_WAIT until we stop getting packets,3806 * reset the timeout.3807 */3808 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3809 break;
3810 caseTCP_CLOSE:
3811 /*3812 * don't touch the timer.3813 */3814 break;
3815 default:
3816 /*3817 * Must check send_head, write_queue, and ack_backlog3818 * to determine which timeout to use.3819 */3820 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {3821 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3822 }elseif (sk->keepopen) {3823 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3824 }else{3825 del_timer(&sk->retransmit_timer);
3826 sk->ip_xmit_timeout = 0;
3827 }3828 break;
3829 }3830 }3831
3832 /*3833 * We have nothing queued but space to send. Send any partial3834 * packets immediately (end of Nagle rule application).3835 */3836
3837 if (sk->packets_out == 0 && sk->partial != NULL &&
3838 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
3839 {3840 flag |= 1;
3841 tcp_send_partial(sk);
3842 }3843
3844 /*3845 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and3846 * we are now waiting for an acknowledge to our FIN. The other end is3847 * already in TIME_WAIT.3848 *3849 * Move to TCP_CLOSE on success.3850 */3851
3852 if (sk->state == TCP_LAST_ACK)
3853 {3854 if (!sk->dead)
3855 sk->state_change(sk);
3856 if(sk->debug)
3857 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3858 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3859 if (sk->rcv_ack_seq == sk->write_seq/*&& sk->acked_seq == sk->fin_seq*/)
3860 {3861 flag |= 1;
3862 tcp_set_state(sk,TCP_CLOSE);
3863 sk->shutdown = SHUTDOWN_MASK;
3864 }3865 }3866
3867 /*3868 * Incoming ACK to a FIN we sent in the case of our initiating the close.3869 *3870 * Move to FIN_WAIT2 to await a FIN from the other end. Set3871 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.3872 */3873
3874 if (sk->state == TCP_FIN_WAIT1)
3875 {3876
3877 if (!sk->dead)
3878 sk->state_change(sk);
3879 if (sk->rcv_ack_seq == sk->write_seq)
3880 {3881 flag |= 1;
3882 sk->shutdown |= SEND_SHUTDOWN;
3883 tcp_set_state(sk, TCP_FIN_WAIT2);
3884 }3885 }3886
3887 /*3888 * Incoming ACK to a FIN we sent in the case of a simultaneous close.3889 *3890 * Move to TIME_WAIT3891 */3892
3893 if (sk->state == TCP_CLOSING)
3894 {3895
3896 if (!sk->dead)
3897 sk->state_change(sk);
3898 if (sk->rcv_ack_seq == sk->write_seq)
3899 {3900 flag |= 1;
3901 tcp_time_wait(sk);
3902 }3903 }3904
3905 /*3906 * Final ack of a three way shake 3907 */3908
3909 if(sk->state==TCP_SYN_RECV)
3910 {3911 tcp_set_state(sk, TCP_ESTABLISHED);
3912 tcp_options(sk,th);
3913 sk->dummy_th.dest=th->source;
3914 sk->copied_seq = sk->acked_seq;
3915 if(!sk->dead)
3916 sk->state_change(sk);
3917 if(sk->max_window==0)
3918 {3919 sk->max_window=32; /* Sanity check */3920 sk->mss=min(sk->max_window,sk->mtu);
3921 }3922 }3923
3924 /*3925 * I make no guarantees about the first clause in the following3926 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under3927 * what conditions "!flag" would be true. However I think the rest3928 * of the conditions would prevent that from causing any3929 * unnecessary retransmission. 3930 * Clearly if the first packet has expired it should be 3931 * retransmitted. The other alternative, "flag&2 && retransmits", is3932 * harder to explain: You have to look carefully at how and when the3933 * timer is set and with what timeout. The most recent transmission always3934 * sets the timer. So in general if the most recent thing has timed3935 * out, everything before it has as well. So we want to go ahead and3936 * retransmit some more. If we didn't explicitly test for this3937 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"3938 * would not be true. If you look at the pattern of timing, you can3939 * show that rto is increased fast enough that the next packet would3940 * almost never be retransmitted immediately. Then you'd end up3941 * waiting for a timeout to send each packet on the retransmission3942 * queue. With my implementation of the Karn sampling algorithm,3943 * the timeout would double each time. The net result is that it would3944 * take a hideous amount of time to recover from a single dropped packet.3945 * It's possible that there should also be a test for TIME_WRITE, but3946 * I think as long as "send_head != NULL" and "retransmit" is on, we've3947 * got to be in real retransmission mode.3948 * Note that tcp_do_retransmit is called with all==1. Setting cong_window3949 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.3950 * As long as no further losses occur, this seems reasonable.3951 */3952
3953 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3954 (((flag&2) && sk->retransmits) ||
3955 (sk->send_head->when + sk->rto < jiffies)))
3956 {3957 if(sk->send_head->when + sk->rto < jiffies)
3958 tcp_retransmit(sk,0);
3959 else3960 {3961 tcp_do_retransmit(sk, 1);
3962 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3963 }3964 }3965
3966 return(1);
3967 }3968
3969
3970 /*3971 * Process the FIN bit. This now behaves as it is supposed to work3972 * and the FIN takes effect when it is validly part of sequence3973 * space. Not before when we get holes.3974 *3975 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT3976 * (and thence onto LAST-ACK and finally, CLOSE, we never enter3977 * TIME-WAIT)3978 *3979 * If we are in FINWAIT-1, a received FIN indicates simultaneous3980 * close and we go into CLOSING (and later onto TIME-WAIT)3981 *3982 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.3983 *3984 */3985
3986 staticinttcp_fin(structsk_buff *skb, structsock *sk, structtcphdr *th)
/* */3987 {3988 sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3989
3990 if (!sk->dead)
3991 {3992 sk->state_change(sk);
3993 sock_wake_async(sk->socket, 1);
3994 }3995
3996 switch(sk->state)
3997 {3998 caseTCP_SYN_RECV:
3999 caseTCP_SYN_SENT:
4000 caseTCP_ESTABLISHED:
4001 /*4002 * move to CLOSE_WAIT, tcp_data() already handled4003 * sending the ack.4004 */4005 tcp_set_state(sk,TCP_CLOSE_WAIT);
4006 if (th->rst)
4007 sk->shutdown = SHUTDOWN_MASK;
4008 break;
4009
4010 caseTCP_CLOSE_WAIT:
4011 caseTCP_CLOSING:
4012 /*4013 * received a retransmission of the FIN, do4014 * nothing.4015 */4016 break;
4017 caseTCP_TIME_WAIT:
4018 /*4019 * received a retransmission of the FIN,4020 * restart the TIME_WAIT timer.4021 */4022 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4023 return(0);
4024 caseTCP_FIN_WAIT1:
4025 /*4026 * This case occurs when a simultaneous close4027 * happens, we must ack the received FIN and4028 * enter the CLOSING state.4029 *4030 * This causes a WRITE timeout, which will either4031 * move on to TIME_WAIT when we timeout, or resend4032 * the FIN properly (maybe we get rid of that annoying4033 * FIN lost hang). The TIME_WRITE code is already correct4034 * for handling this timeout.4035 */4036
4037 if(sk->ip_xmit_timeout != TIME_WRITE)
4038 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4039 tcp_set_state(sk,TCP_CLOSING);
4040 break;
4041 caseTCP_FIN_WAIT2:
4042 /*4043 * received a FIN -- send ACK and enter TIME_WAIT4044 */4045 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4046 sk->shutdown|=SHUTDOWN_MASK;
4047 tcp_set_state(sk,TCP_TIME_WAIT);
4048 break;
4049 caseTCP_CLOSE:
4050 /*4051 * already in CLOSE4052 */4053 break;
4054 default:
4055 tcp_set_state(sk,TCP_LAST_ACK);
4056
4057 /* Start the timers. */4058 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4059 return(0);
4060 }4061
4062 return(0);
4063 }4064
4065
4066
4067 /*4068 * This routine handles the data. If there is room in the buffer,4069 * it will be have already been moved into it. If there is no4070 * room, then we will just have to discard the packet.4071 */4072
4073 extern__inline__inttcp_data(structsk_buff *skb, structsock *sk,
/* */4074 unsignedlongsaddr, unsignedshortlen)
4075 {4076 structsk_buff *skb1, *skb2;
4077 structtcphdr *th;
4078 intdup_dumped=0;
4079 u32new_seq, shut_seq;
4080
4081 th = skb->h.th;
4082 skb_pull(skb,th->doff*4);
4083 skb_trim(skb,len-(th->doff*4));
4084
4085 /*4086 * The bytes in the receive read/assembly queue has increased. Needed for the4087 * low memory discard algorithm 4088 */4089
4090 sk->bytes_rcv += skb->len;
4091
4092 if (skb->len == 0 && !th->fin)
4093 {4094 /* 4095 * Don't want to keep passing ack's back and forth. 4096 * (someone sent us dataless, boring frame)4097 */4098 if (!th->ack)
4099 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4100 kfree_skb(skb, FREE_READ);
4101 return(0);
4102 }4103
4104 /*4105 * We no longer have anyone receiving data on this connection.4106 */4107
4108 #ifndef TCP_DONT_RST_SHUTDOWN
4109
4110 if(sk->shutdown & RCV_SHUTDOWN)
4111 {4112 /*4113 * FIXME: BSD has some magic to avoid sending resets to4114 * broken 4.2 BSD keepalives. Much to my surprise a few non4115 * BSD stacks still have broken keepalives so we want to4116 * cope with it.4117 */4118
4119 if(skb->len) /* We don't care if it's just an ack or4120 a keepalive/window probe */4121 {4122 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */4123
4124 /* Do this the way 4.4BSD treats it. Not what I'd4125 regard as the meaning of the spec but it's what BSD4126 does and clearly they know everything 8) */4127
4128 /*4129 * This is valid because of two things4130 *4131 * a) The way tcp_data behaves at the bottom.4132 * b) A fin takes effect when read not when received.4133 */4134
4135 shut_seq=sk->acked_seq+1; /* Last byte */4136
4137 if(after(new_seq,shut_seq))
4138 {4139 if(sk->debug)
4140 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4141 sk, new_seq, shut_seq, sk->blog);
4142 if(sk->dead)
4143 {4144 sk->acked_seq = new_seq + th->fin;
4145 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4146 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4147 tcp_statistics.TcpEstabResets++;
4148 tcp_set_state(sk,TCP_CLOSE);
4149 sk->err = EPIPE;
4150 sk->shutdown = SHUTDOWN_MASK;
4151 kfree_skb(skb, FREE_READ);
4152 return 0;
4153 }4154 }4155 }4156 }4157
4158 #endif4159
4160 /*4161 * Now we have to walk the chain, and figure out where this one4162 * goes into it. This is set up so that the last packet we received4163 * will be the first one we look at, that way if everything comes4164 * in order, there will be no performance loss, and if they come4165 * out of order we will be able to fit things in nicely.4166 *4167 * [AC: This is wrong. We should assume in order first and then walk4168 * forwards from the first hole based upon real traffic patterns.]4169 * 4170 */4171
4172 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */4173 {4174 skb_queue_head(&sk->receive_queue,skb);
4175 skb1= NULL;
4176 }4177 else4178 {4179 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
4180 {4181 if(sk->debug)
4182 {4183 printk("skb1=%p :", skb1);
4184 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
4185 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
4186 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4187 sk->acked_seq);
4188 }4189
4190 /*4191 * Optimisation: Duplicate frame or extension of previous frame from4192 * same sequence point (lost ack case).4193 * The frame contains duplicate data or replaces a previous frame4194 * discard the previous frame (safe as sk->inuse is set) and put4195 * the new one in its place.4196 */4197
4198 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
4199 {4200 skb_append(skb1,skb);
4201 skb_unlink(skb1);
4202 kfree_skb(skb1,FREE_READ);
4203 dup_dumped=1;
4204 skb1=NULL;
4205 break;
4206 }4207
4208 /*4209 * Found where it fits4210 */4211
4212 if (after(th->seq+1, skb1->h.th->seq))
4213 {4214 skb_append(skb1,skb);
4215 break;
4216 }4217
4218 /*4219 * See if we've hit the start. If so insert.4220 */4221 if (skb1 == skb_peek(&sk->receive_queue))
4222 {4223 skb_queue_head(&sk->receive_queue, skb);
4224 break;
4225 }4226 }4227 }4228
4229 /*4230 * Figure out what the ack value for this frame is4231 */4232
4233 th->ack_seq = th->seq + skb->len;
4234 if (th->syn)
4235 th->ack_seq++;
4236 if (th->fin)
4237 th->ack_seq++;
4238
4239 if (before(sk->acked_seq, sk->copied_seq))
4240 {4241 printk("*** tcp.c:tcp_data bug acked < copied\n");
4242 sk->acked_seq = sk->copied_seq;
4243 }4244
4245 /*4246 * Now figure out if we can ack anything. This is very messy because we really want two4247 * receive queues, a completed and an assembly queue. We also want only one transmit4248 * queue.4249 */4250
4251 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
4252 {4253 if (before(th->seq, sk->acked_seq+1))
4254 {4255 intnewwindow;
4256
4257 if (after(th->ack_seq, sk->acked_seq))
4258 {4259 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
4260 if (newwindow < 0)
4261 newwindow = 0;
4262 sk->window = newwindow;
4263 sk->acked_seq = th->ack_seq;
4264 }4265 skb->acked = 1;
4266
4267 /*4268 * When we ack the fin, we do the FIN 4269 * processing.4270 */4271
4272 if (skb->h.th->fin)
4273 {4274 tcp_fin(skb,sk,skb->h.th);
4275 }4276
4277 for(skb2 = skb->next;
4278 skb2 != (structsk_buff *)&sk->receive_queue;
4279 skb2 = skb2->next)
4280 {4281 if (before(skb2->h.th->seq, sk->acked_seq+1))
4282 {4283 if (after(skb2->h.th->ack_seq, sk->acked_seq))
4284 {4285 newwindow = sk->window -
4286 (skb2->h.th->ack_seq - sk->acked_seq);
4287 if (newwindow < 0)
4288 newwindow = 0;
4289 sk->window = newwindow;
4290 sk->acked_seq = skb2->h.th->ack_seq;
4291 }4292 skb2->acked = 1;
4293 /*4294 * When we ack the fin, we do4295 * the fin handling.4296 */4297 if (skb2->h.th->fin)
4298 {4299 tcp_fin(skb,sk,skb->h.th);
4300 }4301
4302 /*4303 * Force an immediate ack.4304 */4305
4306 sk->ack_backlog = sk->max_ack_backlog;
4307 }4308 else4309 {4310 break;
4311 }4312 }4313
4314 /*4315 * This also takes care of updating the window.4316 * This if statement needs to be simplified.4317 */4318 if (!sk->delay_acks ||
4319 sk->ack_backlog >= sk->max_ack_backlog ||
4320 sk->bytes_rcv > sk->max_unacked || th->fin) {4321 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4322 }4323 else4324 {4325 sk->ack_backlog++;
4326 if(sk->debug)
4327 printk("Ack queued.\n");
4328 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4329 }4330 }4331 }4332
4333 /*4334 * If we've missed a packet, send an ack.4335 * Also start a timer to send another.4336 */4337
4338 if (!skb->acked)
4339 {4340
4341 /*4342 * This is important. If we don't have much room left,4343 * we need to throw out a few packets so we have a good4344 * window. Note that mtu is used, not mss, because mss is really4345 * for the send side. He could be sending us stuff as large as mtu.4346 */4347
4348 while (sock_rspace(sk) < sk->mtu)
4349 {4350 skb1 = skb_peek(&sk->receive_queue);
4351 if (skb1 == NULL)
4352 {4353 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4354 break;
4355 }4356
4357 /*4358 * Don't throw out something that has been acked. 4359 */4360
4361 if (skb1->acked)
4362 {4363 break;
4364 }4365
4366 skb_unlink(skb1);
4367 kfree_skb(skb1, FREE_READ);
4368 }4369 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4370 sk->ack_backlog++;
4371 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4372 }4373 else4374 {4375 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4376 }4377
4378 /*4379 * Now tell the user we may have some data. 4380 */4381
4382 if (!sk->dead)
4383 {4384 if(sk->debug)
4385 printk("Data wakeup.\n");
4386 sk->data_ready(sk,0);
4387 }4388 return(0);
4389 }4390
4391
4392 /*4393 * This routine is only called when we have urgent data4394 * signalled. Its the 'slow' part of tcp_urg. It could be4395 * moved inline now as tcp_urg is only called from one4396 * place. We handle URGent data wrong. We have to - as4397 * BSD still doesn't use the correction from RFC961.4398 */4399
4400 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4401 {4402 u32ptr = ntohs(th->urg_ptr);
4403
4404 if (ptr)
4405 ptr--;
4406 ptr += th->seq;
4407
4408 /* ignore urgent data that we've already seen and read */4409 if (after(sk->copied_seq, ptr))
4410 return;
4411
4412 /* do we already have a newer (or duplicate) urgent pointer? */4413 if (sk->urg_data && !after(ptr, sk->urg_seq))
4414 return;
4415
4416 /* tell the world about our new urgent pointer */4417 if (sk->proc != 0) {4418 if (sk->proc > 0) {4419 kill_proc(sk->proc, SIGURG, 1);
4420 }else{4421 kill_pg(-sk->proc, SIGURG, 1);
4422 }4423 }4424 sk->urg_data = URG_NOTYET;
4425 sk->urg_seq = ptr;
4426 }4427
4428 /*4429 * This is the 'fast' part of urgent handling.4430 */4431
4432 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4433 unsignedlongsaddr, unsignedlonglen)
4434 {4435 u32ptr;
4436
4437 /*4438 * Check if we get a new urgent pointer - normally not 4439 */4440
4441 if (th->urg)
4442 tcp_check_urg(sk,th);
4443
4444 /*4445 * Do we wait for any urgent data? - normally not4446 */4447
4448 if (sk->urg_data != URG_NOTYET)
4449 return 0;
4450
4451 /*4452 * Is the urgent pointer pointing into this packet? 4453 */4454
4455 ptr = sk->urg_seq - th->seq + th->doff*4;
4456 if (ptr >= len)
4457 return 0;
4458
4459 /*4460 * Ok, got the correct packet, update info 4461 */4462
4463 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4464 if (!sk->dead)
4465 sk->data_ready(sk,0);
4466 return 0;
4467 }4468
4469 /*4470 * This will accept the next outstanding connection. 4471 */4472
4473 staticstructsock *tcp_accept(structsock *sk, intflags)
/* */4474 {4475 structsock *newsk;
4476 structsk_buff *skb;
4477
4478 /*4479 * We need to make sure that this socket is listening,4480 * and that it has something pending.4481 */4482
4483 if (sk->state != TCP_LISTEN)
4484 {4485 sk->err = EINVAL;
4486 return(NULL);
4487 }4488
4489 /* Avoid the race. */4490 cli();
4491 sk->inuse = 1;
4492
4493 while((skb = tcp_dequeue_established(sk)) == NULL)
4494 {4495 if (flags & O_NONBLOCK)
4496 {4497 sti();
4498 release_sock(sk);
4499 sk->err = EAGAIN;
4500 return(NULL);
4501 }4502
4503 release_sock(sk);
4504 interruptible_sleep_on(sk->sleep);
4505 if (current->signal & ~current->blocked)
4506 {4507 sti();
4508 sk->err = ERESTARTSYS;
4509 return(NULL);
4510 }4511 sk->inuse = 1;
4512 }4513 sti();
4514
4515 /*4516 * Now all we need to do is return skb->sk. 4517 */4518
4519 newsk = skb->sk;
4520
4521 kfree_skb(skb, FREE_READ);
4522 sk->ack_backlog--;
4523 release_sock(sk);
4524 return(newsk);
4525 }4526
4527
4528 /*4529 * This will initiate an outgoing connection. 4530 */4531
4532 staticinttcp_connect(structsock *sk, structsockaddr_in *usin, intaddr_len)
/* */4533 {4534 structsk_buff *buff;
4535 structdevice *dev=NULL;
4536 unsignedchar *ptr;
4537 inttmp;
4538 intatype;
4539 structtcphdr *t1;
4540 structrtable *rt;
4541
4542 if (sk->state != TCP_CLOSE)
4543 return(-EISCONN);
4544
4545 /*4546 * Don't allow a double connect.4547 */4548
4549 if(sk->daddr)
4550 return -EINVAL;
4551
4552 if (addr_len < 8)
4553 return(-EINVAL);
4554
4555 if (usin->sin_family && usin->sin_family != AF_INET)
4556 return(-EAFNOSUPPORT);
4557
4558 /*4559 * connect() to INADDR_ANY means loopback (BSD'ism).4560 */4561
4562 if(usin->sin_addr.s_addr==INADDR_ANY)
4563 usin->sin_addr.s_addr=ip_my_addr();
4564
4565 /*4566 * Don't want a TCP connection going to a broadcast address 4567 */4568
4569 if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
4570 return -ENETUNREACH;
4571
4572 sk->inuse = 1;
4573 sk->daddr = usin->sin_addr.s_addr;
4574 sk->write_seq = tcp_init_seq();
4575 sk->window_seq = sk->write_seq;
4576 sk->rcv_ack_seq = sk->write_seq -1;
4577 sk->err = 0;
4578 sk->dummy_th.dest = usin->sin_port;
4579 release_sock(sk);
4580
4581 buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4582 if (buff == NULL)
4583 {4584 return(-ENOMEM);
4585 }4586 sk->inuse = 1;
4587 buff->sk = sk;
4588 buff->free = 0;
4589 buff->localroute = sk->localroute;
4590
4591
4592 /*4593 * Put in the IP header and routing stuff.4594 */4595
4596 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4597 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
4598 if (tmp < 0)
4599 {4600 sock_wfree(sk, buff);
4601 release_sock(sk);
4602 return(-ENETUNREACH);
4603 }4604 if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
4605 sk->saddr = rt->rt_src;
4606 sk->rcv_saddr = sk->saddr;
4607
4608 t1 = (structtcphdr *) skb_put(buff,sizeof(structtcphdr));
4609
4610 memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4611 t1->seq = ntohl(sk->write_seq++);
4612 sk->sent_seq = sk->write_seq;
4613 buff->h.seq = sk->write_seq;
4614 t1->ack = 0;
4615 t1->window = 2;
4616 t1->res1=0;
4617 t1->res2=0;
4618 t1->rst = 0;
4619 t1->urg = 0;
4620 t1->psh = 0;
4621 t1->syn = 1;
4622 t1->urg_ptr = 0;
4623 t1->doff = 6;
4624 /* use 512 or whatever user asked for */4625
4626 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4627 sk->window_clamp=rt->rt_window;
4628 else4629 sk->window_clamp=0;
4630
4631 if (sk->user_mss)
4632 sk->mtu = sk->user_mss;
4633 elseif (rt)
4634 sk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
4635 else4636 sk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
4637
4638 /*4639 * but not bigger than device MTU 4640 */4641
4642 if(sk->mtu <32)
4643 sk->mtu = 32; /* Sanity limit */4644
4645 sk->mtu = min(sk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
4646
4647 #ifdefCONFIG_SKIP4648
4649 /*4650 * SKIP devices set their MTU to 65535. This is so they can take packets4651 * unfragmented to security process then fragment. They could lie to the4652 * TCP layer about a suitable MTU, but its easier to let skip sort it out4653 * simply because the final package we want unfragmented is going to be4654 *4655 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]4656 */4657
4658 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */4659 sk->mtu=skip_pick_mtu(sk->mtu,dev);
4660 #endif4661
4662 /*4663 * Put in the TCP options to say MTU. 4664 */4665
4666 ptr = skb_put(buff,4);
4667 ptr[0] = 2;
4668 ptr[1] = 4;
4669 ptr[2] = (sk->mtu) >> 8;
4670 ptr[3] = (sk->mtu) & 0xff;
4671 tcp_send_check(t1, sk->saddr, sk->daddr,
4672 sizeof(structtcphdr) + 4, sk);
4673
4674 /*4675 * This must go first otherwise a really quick response will get reset. 4676 */4677
4678 tcp_cache_zap();
4679 tcp_set_state(sk,TCP_SYN_SENT);
4680 if(rt&&rt->rt_flags&RTF_IRTT)
4681 sk->rto = rt->rt_irtt;
4682 else4683 sk->rto = TCP_TIMEOUT_INIT;
4684 sk->retransmit_timer.function=&retransmit_timer;
4685 sk->retransmit_timer.data = (unsignedlong)sk;
4686 reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */4687 sk->retransmits = 0; /* Now works the right way instead of a hacked 4688 initial setting */4689
4690 sk->prot->queue_xmit(sk, dev, buff, 0);
4691 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4692 tcp_statistics.TcpActiveOpens++;
4693 tcp_statistics.TcpOutSegs++;
4694
4695 release_sock(sk);
4696 return(0);
4697 }4698
4699
4700 /*4701 * This functions checks to see if the tcp header is actually acceptable. 4702 */4703
4704 extern__inline__inttcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4705 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4706 {4707 u32next_seq;
4708
4709 next_seq = len - 4*th->doff;
4710 if (th->fin)
4711 next_seq++;
4712 /* if we have a zero window, we can't have any data in the packet.. */4713 if (next_seq && !sk->window)
4714 gotoignore_it;
4715 next_seq += th->seq;
4716
4717 /*4718 * This isn't quite right. sk->acked_seq could be more recent4719 * than sk->window. This is however close enough. We will accept4720 * slightly more packets than we should, but it should not cause4721 * problems unless someone is trying to forge packets.4722 */4723
4724 /* have we already seen all of this packet? */4725 if (!after(next_seq+1, sk->acked_seq))
4726 gotoignore_it;
4727 /* or does it start beyond the window? */4728 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4729 gotoignore_it;
4730
4731 /* ok, at least part of this packet would seem interesting.. */4732 return 1;
4733
4734 ignore_it:
4735 if (th->rst)
4736 return 0;
4737
4738 /*4739 * Send a reset if we get something not ours and we are4740 * unsynchronized. Note: We don't do anything to our end. We4741 * are just killing the bogus remote connection then we will4742 * connect again and it will work (with luck).4743 */4744
4745 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4746 {4747 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4748 return 1;
4749 }4750
4751 /* Try to resync things. */4752 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4753 return 0;
4754 }4755
4756 /*4757 * When we get a reset we do this.4758 */4759
/*
 *	Standard handling for an incoming RST: record the right error for
 *	the current state, close the socket, wake the owner, and drop the
 *	frame.
 */
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Pick the errno the state implies. */
	sk->err = ECONNRESET;
	if (sk->state == TCP_SYN_SENT)
		sk->err = ECONNREFUSED;
	if (sk->state == TCP_CLOSE_WAIT)
		sk->err = EPIPE;
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk, TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk, TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 *
 *	Entry point installed in tcp_prot; called from the IP layer for
 *	each received TCP segment, and again (redo != 0) when replaying
 *	a segment that was queued on the socket backlog.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	tcp_statistics.TcpInSegs++;
	/* Segments not addressed to this host are dropped silently. */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket, using the last hit cache if applicable.
	 */

	if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
	{
		sk=(struct sock *)th_cache_sk;
		/*
		 *	We think this is causing the bug so
		 *	(sanity check: the cached socket must agree with a
		 *	fresh lookup; mismatch indicates cache corruption)
		 */
		if(sk!=get_sock(&tcp_prot,th->dest, saddr, th->source, daddr))
			printk("Cache mismatch on TCP.\n");
	}
	else
	{
		sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
		th_cache_saddr=saddr;
		th_cache_daddr=daddr;
		th_cache_dport=th->dest;
		th_cache_sport=th->source;
		th_cache_sk=sk;
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo)
	{
		/*
		 *	Pull up the IP header.
		 */
		skb_pull(skb, skb->h.raw-skb->data);
		/*
		 *	Try to use the device checksum if provided.
		 */
		if (
			(skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
			(!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
			)
		{
			/* Bad checksum: discard without touching the socket. */
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}
		/* Sequence numbers are kept in host order from here on. */
		th->seq = ntohl(th->seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			/*
			 *	Discard frame
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/*
		 *	We may need to add it to the backlog here.
		 *	Interrupts are disabled around the inuse test so a
		 *	segment cannot race with the socket owner.
		 */
		cli();
		if (sk->inuse)
		{
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Backlog replay: the checksum was verified on first pass. */
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)
	{
		/* Should be impossible: every live TCP socket has a proto. */
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 */

	skb->sk=sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/* Reset the ack - its an ack from a
					   different connection  [ th->rst is checked in tcp_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq=th->seq+1;
				sk->fin_seq=th->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				/* Peer sent no window-scale info: assume a tiny window. */
				if(sk->max_window==0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
	 *	a more complex suggestion for fixing these reuse issues in RFC1644
	 *	but not yet ready for general use. Also see RFC1379.
	 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(th->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			/* Uncharge this buffer: it moves to the listener below. */
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			/* Re-look-up: a listener may be waiting on this port. */
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				/* seq+128000 keeps the new ISN ahead of the old one. */
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */

	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}

	/*
	 *	Process the ACK
	 */


	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	If the accepted buffer put us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */

	if (sk->rmem_alloc >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}


	/*
	 *	Process urgent data
	 */

	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	Process the encapsulated data
	 */

	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */

	release_sock(sk);
	return 0;
}
/*
 *	This routine sends a packet with an out of date sequence
 *	number. It assumes the other end will try to ack it.
 *
 *	Used as the zero-window probe: either re-send a slice of the
 *	queued data that now fits the peer's window, or send a bare
 *	ACK carrying sent_seq-1 to provoke a window update.
 */

static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff,*skb;
	struct tcphdr *t1;
	struct device *dev=NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 *	Write data can still be transmitted/retransmitted in the
	 *	following states.  If any other state is encountered, return.
	 *	[listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED &&
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 &&
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	)
	{
		return;
	}
	if ( before(sk->sent_seq, sk->window_seq) &&
	    (skb=skb_peek(&sk->write_queue)))
	{
		/*
		 * We are probing the opening of a window
		 * but the window size is != 0
		 * must have been a result SWS advoidance ( sender )
		 */

		struct iphdr *iph;
		struct tcphdr *th;
		struct tcphdr *nth;
		unsigned long win_size;
#if 0
		unsigned long ow_size;
#endif
		void * tcp_data_start;

		/*
		 *	How many bytes can we send ?
		 */

		win_size = sk->window_seq - sk->sent_seq;

		/*
		 *	Recover the buffer pointers
		 */

		iph = (struct iphdr *)skb->ip_hdr;
		th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));

		/*
		 *	Grab the data for a temporary frame
		 */

		buff = sock_wmalloc(sk, win_size + th->doff * 4 +
				     (iph->ihl << 2) +
				     sk->prot->max_header + 15,
				     1, GFP_ATOMIC);
		if ( buff == NULL )
			return;

		/*
		 *	If we strip the packet on the write queue we must
		 *	be ready to retransmit this one
		 */

		buff->free = /*0*/1;

		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put headers on the new packet
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
					     IPPROTO_TCP, sk->opt, buff->truesize,
					     sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
		if (tmp < 0)
		{
			sock_wfree(sk, buff);
			return;
		}

		/*
		 *	Move the TCP header over
		 */

		buff->dev = dev;

		nth = (struct tcphdr *) skb_put(buff,th->doff*4);

		memcpy(nth, th, th->doff * 4);

		/*
		 *	Correct the new header
		 */

		nth->ack = 1;
		/* NOTE(review): ntohl/ntohs used where htonl/htons might be
		   expected; both are the same byte-swap on these platforms. */
		nth->ack_seq = ntohl(sk->acked_seq);
		nth->window = ntohs(tcp_select_window(sk));
		nth->check = 0;

		/*
		 *	Find the first data byte.
		 */

		tcp_data_start = skb->data + skb->dev->hard_header_len +
			(iph->ihl << 2) + th->doff * 4;

		/*
		 *	Add it to our new buffer
		 */
		memcpy(skb_put(buff,win_size), tcp_data_start, win_size);

		/*
		 *	Remember our right edge sequence number.
		 */

		buff->h.seq = sk->sent_seq + win_size;
		sk->sent_seq = buff->h.seq;		/* Hack */
#if 0

		/*
		 *	now: shrink the queue head segment
		 *	(disabled alternative that trims the copied bytes off
		 *	the original queued skb and fixes up its URG pointer)
		 */

		th->check = 0;
		ow_size = skb->len - win_size -
			((unsigned long) (tcp_data_start - (void *) skb->data));

		memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
		skb_trim(skb,skb->len-win_size);
		sk->sent_seq += win_size;
		th->seq = htonl(sk->sent_seq);
		if (th->urg)
		{
			unsigned short urg_ptr;

			urg_ptr = ntohs(th->urg_ptr);
			if (urg_ptr <= win_size)
				th->urg = 0;
			else
			{
				urg_ptr -= win_size;
				th->urg_ptr = htons(urg_ptr);
				nth->urg_ptr = htons(win_size);
			}
		}
#else
		/* Urgent pointer falls inside the copied slice: clear URG. */
		if(th->urg && ntohs(th->urg_ptr) < win_size)
			nth->urg = 0;
#endif

		/*
		 *	Checksum the split buffer
		 */

		tcp_send_check(nth, sk->saddr, sk->daddr,
			       nth->doff * 4 + win_size , sk);
	}
	else
	{
		buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
		if (buff == NULL)
			return;

		buff->free = 1;
		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put in the IP header and routing stuff.
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
		if (tmp < 0)
		{
			sock_wfree(sk, buff);
			return;
		}

		t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
		memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

		/*
		 *	Use a previous sequence.
		 *	This should cause the other end to send an ack.
		 */

		t1->seq = htonl(sk->sent_seq-1);
		t1->ack = 1;
		t1->res1= 0;
		t1->res2= 0;
		t1->rst = 0;
		t1->urg = 0;
		t1->psh = 0;
		t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
		t1->syn = 0;
		t1->ack_seq = ntohl(sk->acked_seq);
		t1->window = ntohs(tcp_select_window(sk));
		t1->doff = sizeof(*t1)/4;
		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	}

	/*
	 *	Send it.
	 */

	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
5429 /*5430 * A window probe timeout has occurred.5431 */5432
5433 voidtcp_send_probe0(structsock *sk)
/* */5434 {5435 if (sk->zapped)
5436 return; /* After a valid reset we can send no more */5437
5438 tcp_write_wakeup(sk);
5439
5440 sk->backoff++;
5441 sk->rto = min(sk->rto << 1, 120*HZ);
5442 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5443 sk->retransmits++;
5444 sk->prot->retransmits ++;
5445 }5446
5447 /*5448 * Socket option code for TCP. 5449 */5450
5451 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5452 {5453 intval,err;
5454
5455 if(level!=SOL_TCP)
5456 returnip_setsockopt(sk,level,optname,optval,optlen);
5457
5458 if (optval == NULL)
5459 return(-EINVAL);
5460
5461 err=verify_area(VERIFY_READ, optval, sizeof(int));
5462 if(err)
5463 returnerr;
5464
5465 val = get_user((int *)optval);
5466
5467 switch(optname)
5468 {5469 caseTCP_MAXSEG:
5470 /*5471 * values greater than interface MTU won't take effect. however at5472 * the point when this call is done we typically don't yet know5473 * which interface is going to be used5474 */5475 if(val<1||val>MAX_WINDOW)
5476 return -EINVAL;
5477 sk->user_mss=val;
5478 return 0;
5479 caseTCP_NODELAY:
5480 sk->nonagle=(val==0)?0:1;
5481 return 0;
5482 default:
5483 return(-ENOPROTOOPT);
5484 }5485 }5486
5487 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5488 {5489 intval,err;
5490
5491 if(level!=SOL_TCP)
5492 returnip_getsockopt(sk,level,optname,optval,optlen);
5493
5494 switch(optname)
5495 {5496 caseTCP_MAXSEG:
5497 val=sk->user_mss;
5498 break;
5499 caseTCP_NODELAY:
5500 val=sk->nonagle;
5501 break;
5502 default:
5503 return(-ENOPROTOOPT);
5504 }5505 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5506 if(err)
5507 returnerr;
5508 put_user(sizeof(int),(int *) optlen);
5509
5510 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5511 if(err)
5512 returnerr;
5513 put_user(val,(int *)optval);
5514
5515 return(0);
5516 }5517
5518
/*
 *	The TCP protocol operations table registered with the socket
 *	layer. Slot order must match struct proto exactly; the comments
 *	below name each slot after the handler installed there (slot
 *	names for the trailing scalars are inferred -- TODO confirm
 *	against the struct proto declaration in the headers).
 */
struct proto tcp_prot = {
	tcp_close,			/* close */
	ip_build_header,		/* build_header */
	tcp_connect,			/* connect */
	tcp_accept,			/* accept */
	ip_queue_xmit,			/* queue_xmit */
	tcp_retransmit,			/* retransmit */
	tcp_write_wakeup,		/* write_wakeup */
	tcp_read_wakeup,		/* read_wakeup */
	tcp_rcv,			/* rcv */
	tcp_select,			/* select */
	tcp_ioctl,			/* ioctl */
	NULL,				/* no special init */
	tcp_shutdown,			/* shutdown */
	tcp_setsockopt,			/* setsockopt */
	tcp_getsockopt,			/* getsockopt */
	tcp_sendmsg,			/* sendmsg */
	tcp_recvmsg,			/* recvmsg */
	NULL,				/* No special bind() */
	128,				/* max_header -- presumably */
	0,				/* retransmit count, starts at zero */
	"TCP",				/* protocol name */
	0, 0,				/* usage counters -- TODO confirm field names */
	{NULL,}				/* per-port socket array, initially empty */
};