1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while 26 * sk->inuse=1 and was trying to connect 27 * (tcp_err()). 28 * Alan Cox : All icmp error handling was broken 29 * pointers passed where wrong and the 30 * socket was looked up backwards. Nobody 31 * tested any icmp error code obviously. 32 * Alan Cox : tcp_err() now handled properly. It 33 * wakes people on errors. select 34 * behaves and the icmp error race 35 * has gone by moving it into sock.c 36 * Alan Cox : tcp_reset() fixed to work for 37 * everything not just packets for 38 * unknown sockets. 39 * Alan Cox : tcp option processing. 40 * Alan Cox : Reset tweaked (still not 100%) [Had 41 * syn rule wrong] 42 * Herp Rosmanith : More reset fixes 43 * Alan Cox : No longer acks invalid rst frames. 44 * Acking any kind of RST is right out. 45 * Alan Cox : Sets an ignore me flag on an rst 46 * receive otherwise odd bits of prattle 47 * escape still 48 * Alan Cox : Fixed another acking RST frame bug. 49 * Should stop LAN workplace lockups. 
50 * Alan Cox : Some tidyups using the new skb list 51 * facilities 52 * Alan Cox : sk->keepopen now seems to work 53 * Alan Cox : Pulls options out correctly on accepts 54 * Alan Cox : Fixed assorted sk->rqueue->next errors 55 * Alan Cox : PSH doesn't end a TCP read. Switched a 56 * bit to skb ops. 57 * Alan Cox : Tidied tcp_data to avoid a potential 58 * nasty. 59 * Alan Cox : Added some better commenting, as the 60 * tcp is hard to follow 61 * Alan Cox : Removed incorrect check for 20 * psh 62 * Michael O'Reilly : ack < copied bug fix. 63 * Johannes Stille : Misc tcp fixes (not all in yet). 64 * Alan Cox : FIN with no memory -> CRASH 65 * Alan Cox : Added socket option proto entries. 66 * Also added awareness of them to accept. 67 * Alan Cox : Added TCP options (SOL_TCP) 68 * Alan Cox : Switched wakeup calls to callbacks, 69 * so the kernel can layer network 70 * sockets. 71 * Alan Cox : Use ip_tos/ip_ttl settings. 72 * Alan Cox : Handle FIN (more) properly (we hope). 73 * Alan Cox : RST frames sent on unsynchronised 74 * state ack error. 75 * Alan Cox : Put in missing check for SYN bit. 76 * Alan Cox : Added tcp_select_window() aka NET2E 77 * window non shrink trick. 78 * Alan Cox : Added a couple of small NET2E timer 79 * fixes 80 * Charles Hedrick : TCP fixes 81 * Toomas Tamm : TCP window fixes 82 * Alan Cox : Small URG fix to rlogin ^C ack fight 83 * Charles Hedrick : Rewrote most of it to actually work 84 * Linus : Rewrote tcp_read() and URG handling 85 * completely 86 * Gerhard Koerting: Fixed some missing timer handling 87 * Matthew Dillon : Reworked TCP machine states as per RFC 88 * Gerhard Koerting: PC/TCP workarounds 89 * Adam Caldwell : Assorted timer/timing errors 90 * Matthew Dillon : Fixed another RST bug 91 * Alan Cox : Move to kernel side addressing changes. 92 * Alan Cox : Beginning work on TCP fastpathing 93 * (not yet usable) 94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
95 * Alan Cox : TCP fast path debugging 96 * Alan Cox : Window clamping 97 * Michael Riepe : Bug in tcp_check() 98 * Matt Dillon : More TCP improvements and RST bug fixes 99 * Matt Dillon : Yet more small nasties remove from the 100 * TCP code (Be very nice to this man if 101 * tcp finally works 100%) 8) 102 * Alan Cox : BSD accept semantics. 103 * Alan Cox : Reset on closedown bug. 104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 105 * Michael Pall : Handle select() after URG properly in 106 * all cases. 107 * Michael Pall : Undo the last fix in tcp_read_urg() 108 * (multi URG PUSH broke rlogin). 109 * Michael Pall : Fix the multi URG PUSH problem in 110 * tcp_readable(), select() after URG 111 * works now. 112 * Michael Pall : recv(...,MSG_OOB) never blocks in the 113 * BSD api. 114 * Alan Cox : Changed the semantics of sk->socket to 115 * fix a race and a signal problem with 116 * accept() and async I/O. 117 * Alan Cox : Relaxed the rules on tcp_sendto(). 118 * Yury Shevchuk : Really fixed accept() blocking problem. 119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 120 * clients/servers which listen in on 121 * fixed ports. 122 * Alan Cox : Cleaned the above up and shrank it to 123 * a sensible code size. 124 * Alan Cox : Self connect lockup fix. 125 * Alan Cox : No connect to multicast. 126 * Ross Biro : Close unaccepted children on master 127 * socket close. 128 * Alan Cox : Reset tracing code. 129 * Alan Cox : Spurious resets on shutdown. 130 * Alan Cox : Giant 15 minute/60 second timer error 131 * Alan Cox : Small whoops in selecting before an 132 * accept. 133 * Alan Cox : Kept the state trace facility since 134 * it's handy for debugging. 135 * Alan Cox : More reset handler fixes. 
136 * Alan Cox : Started rewriting the code based on 137 * the RFC's for other useful protocol 138 * references see: Comer, KA9Q NOS, and 139 * for a reference on the difference 140 * between specifications and how BSD 141 * works see the 4.4lite source. 142 * A.N.Kuznetsov : Don't time wait on completion of tidy 143 * close. 144 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 145 * Linus Torvalds : Fixed BSD port reuse to work first syn 146 * Alan Cox : Reimplemented timers as per the RFC 147 * and using multiple timers for sanity. 148 * Alan Cox : Small bug fixes, and a lot of new 149 * comments. 150 * Alan Cox : Fixed dual reader crash by locking 151 * the buffers (much like datagram.c) 152 * Alan Cox : Fixed stuck sockets in probe. A probe 153 * now gets fed up of retrying without 154 * (even a no space) answer. 155 * Alan Cox : Extracted closing code better 156 * Alan Cox : Fixed the closing state machine to 157 * resemble the RFC. 158 * Alan Cox : More 'per spec' fixes. 159 * Jorge Cwik : Even faster checksumming. 160 * Alan Cox : tcp_data() doesn't ack illegal PSH 161 * only frames. At least one pc tcp stack 162 * generates them. 163 * Alan Cox : Cache last socket. 164 * Alan Cox : Per route irtt. 165 * Matt Day : Select() match BSD precisely on error 166 * Alan Cox : New buffers 167 * Marc Tamsky : Various sk->prot->retransmits and 168 * sk->retransmits misupdating fixed. 169 * Fixed tcp_write_timeout: stuck close, 170 * and TCP syn retries gets used now. 171 * Mark Yarvis : In tcp_read_wakeup(), don't send an 172 * ack if stat is TCP_CLOSED. 173 * Alan Cox : Look up device on a retransmit - routes may 174 * change. Doesn't yet cope with MSS shrink right 175 * but its a start! 176 * Marc Tamsky : Closing in closing fixes. 177 * Mike Shaver : RFC1122 verifications. 178 * Alan Cox : rcv_saddr errors. 179 * 180 * 181 * To Fix: 182 * Fast path the code. 
Two things here - fix the window calculation 183 * so it doesn't iterate over the queue, also spot packets with no funny 184 * options arriving in order and process directly. 185 * 186 * Implement RFC 1191 [Path MTU discovery] 187 * Look at the effect of implementing RFC 1337 suggestions and their impact. 188 * Rewrite output state machine to use a single queue and do low window 189 * situations as per the spec (RFC 1122) 190 * Speed up input assembly algorithm. 191 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 192 * could do with it working on IPv4 193 * User settable/learned rtt/max window/mtu 194 * Cope with MTU/device switches when retransmitting in tcp. 195 * Fix the window handling to use PR's new code. 196 * 197 * Change the fundamental structure to a single send queue maintained 198 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 199 * active routes too]). Cut the queue off in tcp_retransmit/ 200 * tcp_transmit. 201 * Change the receive queue to assemble as it goes. This lets us 202 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 203 * tcp_data/tcp_read as well as the window shrink crud. 204 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 205 * tcp_queue_skb seem obvious routines to extract. 206 * 207 * This program is free software; you can redistribute it and/or 208 * modify it under the terms of the GNU General Public License 209 * as published by the Free Software Foundation; either version 210 * 2 of the License, or(at your option) any later version. 211 * 212 * Description of States: 213 * 214 * TCP_SYN_SENT sent a connection request, waiting for ack 215 * 216 * TCP_SYN_RECV received a connection request, sent ack, 217 * waiting for final ack in three-way handshake. 
218 * 219 * TCP_ESTABLISHED connection established 220 * 221 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 222 * transmission of remaining buffered data 223 * 224 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 225 * to shutdown 226 * 227 * TCP_CLOSING both sides have shutdown but we still have 228 * data we have to finish sending 229 * 230 * TCP_TIME_WAIT timeout to catch resent junk before entering 231 * closed, can only be entered from FIN_WAIT2 232 * or CLOSING. Required because the other end 233 * may not have gotten our last ACK causing it 234 * to retransmit the data packet (which we ignore) 235 * 236 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 237 * us to finish writing our data and to shutdown 238 * (we have to close() to move on to LAST_ACK) 239 * 240 * TCP_LAST_ACK out side has shutdown after remote has 241 * shutdown. There may still be data in our 242 * buffer that we have to finish sending 243 * 244 * TCP_CLOSE socket is finished 245 */ 246
247 /* 248 * RFC1122 status: 249 * NOTE: I'm not going to be doing comments in the code for this one except 250 * for violations and the like. tcp.c is just too big... If I say something 251 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out 252 * with Alan. -- MS 950903 253 * 254 * Use of PSH (4.2.2.2) 255 * MAY aggregate data sent without the PSH flag. (does) 256 * MAY queue data recieved without the PSH flag. (does) 257 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't) 258 * MAY implement PSH on send calls. (doesn't, thus:) 259 * MUST NOT buffer data indefinitely (doesn't [1 second]) 260 * MUST set PSH on last segment (does) 261 * MAY pass received PSH to application layer (doesn't) 262 * SHOULD send maximum-sized segment whenever possible. (almost always does) 263 * 264 * Window Size (4.2.2.3, 4.2.2.16) 265 * MUST treat window size as an unsigned number (does) 266 * SHOULD treat window size as a 32-bit number (does not) 267 * MUST NOT shrink window once it is offered (does not normally) 268 * 269 * Urgent Pointer (4.2.2.4) 270 * **MUST point urgent pointer to last byte of urgent data (not right 271 * after). (doesn't, to be like BSD) 272 * MUST inform application layer asynchronously of incoming urgent 273 * data. (does) 274 * MUST provide application with means of determining the amount of 275 * urgent data pending. (does) 276 * **MUST support urgent data sequence of arbitrary length. (doesn't, but 277 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) 278 * [Follows BSD 1 byte of urgent data] 279 * 280 * TCP Options (4.2.2.5) 281 * MUST be able to recieve TCP options in any segment. (does) 282 * MUST ignore unsupported options (does) 283 * 284 * Maximum Segment Size Option (4.2.2.6) 285 * MUST implement both sending and receiving MSS. (does) 286 * SHOULD send an MSS with every SYN where recieve MSS != 536 (MAY send 287 * it always). 
(does, even when MSS == 536, which is legal) 288 * MUST assume MSS == 536 if no MSS received at connection setup (does) 289 * MUST calculate "effective send MSS" correctly: 290 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) 291 * (does - but allows operator override) 292 * 293 * TCP Checksum (4.2.2.7) 294 * MUST generate and check TCP checksum. (does) 295 * 296 * Initial Sequence Number Selection (4.2.2.8) 297 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's 298 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is 299 * necessary for 10Mbps networks - and harder than BSD to spoof!) 300 * 301 * Simultaneous Open Attempts (4.2.2.10) 302 * MUST support simultaneous open attempts (does) 303 * 304 * Recovery from Old Duplicate SYN (4.2.2.11) 305 * MUST keep track of active vs. passive open (does) 306 * 307 * RST segment (4.2.2.12) 308 * SHOULD allow an RST segment to contain data (does, but doesn't do 309 * anything with it, which is standard) 310 * 311 * Closing a Connection (4.2.2.13) 312 * MUST inform application of whether connectin was closed by RST or 313 * normal close. (does) 314 * MAY allow "half-duplex" close (treat connection as closed for the 315 * local app, even before handshake is done). (does) 316 * MUST linger in TIME_WAIT for 2 * MSL (does) 317 * 318 * Retransmission Timeout (4.2.2.15) 319 * MUST implement Jacobson's slow start and congestion avoidance 320 * stuff. (does) 321 * 322 * Probing Zero Windows (4.2.2.17) 323 * MUST support probing of zero windows. (does) 324 * MAY keep offered window closed indefinitely. (does) 325 * MUST allow remote window to stay closed indefinitely. (does) 326 * 327 * Passive Open Calls (4.2.2.18) 328 * MUST NOT let new passive open affect other connections. (doesn't) 329 * MUST support passive opens (LISTENs) concurrently. (does) 330 * 331 * Time to Live (4.2.2.19) 332 * MUST make TCP TTL configurable. 
(does - IP_TTL option) 333 * 334 * Event Processing (4.2.2.20) 335 * SHOULD queue out-of-order segments. (does) 336 * MUST aggregate ACK segments whenever possible. (does but badly) 337 * 338 * Retransmission Timeout Calculation (4.2.3.1) 339 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO 340 * calculation. (does, or at least explains them in the comments 8*b) 341 * SHOULD initialize RTO to 0 and RTT to 3. (does) 342 * 343 * When to Send an ACK Segment (4.2.3.2) 344 * SHOULD implement delayed ACK. (does not) 345 * MUST keep ACK delay < 0.5 sec. (N/A) 346 * 347 * When to Send a Window Update (4.2.3.3) 348 * MUST implement receiver-side SWS. (does) 349 * 350 * When to Send Data (4.2.3.4) 351 * MUST implement sender-side SWS. (does - imperfectly) 352 * SHOULD implement Nagle algorithm. (does) 353 * 354 * TCP Connection Failures (4.2.3.5) 355 * MUST handle excessive retransmissions "properly" (see the RFC). (does) 356 * SHOULD inform application layer of soft errors. (doesn't) 357 * 358 * TCP Keep-Alives (4.2.3.6) 359 * MAY provide keep-alives. (does) 360 * MUST make keep-alives configurable on a per-connection basis. (does) 361 * MUST default to no keep-alives. (does) 362 * **MUST make keep-alive interval configurable. (doesn't) 363 * **MUST make default keep-alive interval > 2 hours. (doesn't) 364 * MUST NOT interpret failure to ACK keep-alive packet as dead 365 * connection. (doesn't) 366 * SHOULD send keep-alive with no data. (does) 367 * 368 * TCP Multihoming (4.2.3.7) 369 * MUST get source address from IP layer before sending first 370 * SYN. (does) 371 * MUST use same local address for all segments of a connection. (does) 372 * 373 * IP Options (4.2.3.8) 374 * (I don't think the IP layer sees the IP options, yet.) 375 * MUST ignore unsupported IP options. (does, I guess 8*b) 376 * MAY support Time Stamp and Record Route. (doesn't) 377 * **MUST allow application to specify a source route. (doesn't?) 
 *		**MUST allow received Source Route option to set route for all future
 *		segments on this connection.  (doesn't, not that I think it's a
 *		huge problem)
 *
 *	  ICMP messages (4.2.3.9)
 *		MUST act on ICMP errors. (does)
 *		MUST slow transmission upon receipt of a Source Quench. (does)
 *		MUST NOT abort connection upon receipt of soft Destination
 *		Unreachables (0, 1, 5), Time Exceededs and Parameter
 *		Problems. (doesn't)
 *		SHOULD report soft Destination Unreachables etc. to the
 *		application. (doesn't)
 *		SHOULD abort connection upon receipt of hard Destination Unreachable
 *		messages (2, 3, 4).  (does)
 *
 *	  Remote Address Validation (4.2.3.10)
 *		MUST reject as an error OPEN for invalid remote IP address. (does)
 *		MUST ignore SYN with invalid source address. (does)
 *		MUST silently discard incoming SYN for broadcast/multicast
 *		address. (does)
 *
 *	  Asynchronous Reports (4.2.4.1)
 *		**MUST provide mechanism for reporting soft errors to application
 *		layer. (doesn't)
 *
 *	  Type of Service (4.2.4.2)
 *		MUST allow application layer to set Type of Service. (does IP_TOS)
 *
 *	  (Whew. -- MS 950903)
 **/

409 #include <linux/types.h>
410 #include <linux/sched.h>
411 #include <linux/mm.h>
412 #include <linux/time.h>
413 #include <linux/string.h>
414 #include <linux/config.h>
415 #include <linux/socket.h>
416 #include <linux/sockios.h>
417 #include <linux/termios.h>
418 #include <linux/in.h>
419 #include <linux/fcntl.h>
420 #include <linux/inet.h>
421 #include <linux/netdevice.h>
422 #include <net/snmp.h>
423 #include <net/ip.h>
424 #include <net/protocol.h>
425 #include <net/icmp.h>
426 #include <net/tcp.h>
427 #include <net/arp.h>
428 #include <linux/skbuff.h>
429 #include <net/sock.h>
430 #include <net/route.h>
431 #include <linux/errno.h>
432 #include <linux/timer.h>
433 #include <asm/system.h>
434 #include <asm/segment.h>
435 #include <linux/mm.h>
436 #include <net/checksum.h>
437
/*
 *	The MSL timer is the 'normal' timer.
 */

#define reset_msl_timer(x,y,z)	reset_timer(x,y,z)

#define SEQ_TICK 3
unsigned long seq_offset;
/* Per-protocol SNMP counters (TcpCurrEstab, TcpRetransSegs, ...). */
struct tcp_mib	tcp_statistics;

/*
 *	Cached last hit socket: remembers the address/port 4-tuple of the
 *	most recently demultiplexed socket so the next packet for the same
 *	connection can skip a full lookup.  Cleared by tcp_cache_zap().
 */

volatile unsigned long 	th_cache_saddr, th_cache_daddr;
volatile unsigned short th_cache_dport, th_cache_sport;
volatile struct sock *th_cache_sk;
456 voidtcp_cache_zap(void)
/* */ 457 { 458 unsignedlongflags;
459 save_flags(flags);
460 cli();
461 th_cache_saddr=0;
462 th_cache_daddr=0;
463 th_cache_dport=0;
464 th_cache_sport=0;
465 th_cache_sk=NULL;
466 restore_flags(flags);
467 } 468
/* Forward declaration: tcp_close_pending() below closes child sockets. */
static void tcp_close(struct sock *sk, int timeout);


/*
 *	The less said about this the better, but it works and will do for 1.2
 *
 *	(Woken from tcp_set_state() when a connection moves from SYN_RECV to
 *	ESTABLISHED, so a select() on a listening socket can return.)
 */

static struct wait_queue *master_select_wakeup;
477
/*
 *	Return the smaller of two unsigned quantities.  Result is an int,
 *	matching the original helper's signature.
 */

static __inline__ int min(unsigned int a, unsigned int b)
{
	return (a <= b) ? a : b;
}
/*
 *	Compile-time switch: define STATE_TRACE to get a printk() of every
 *	TCP state transition from tcp_set_state() (debugging aid only).
 */
#undef STATE_TRACE

#ifdef STATE_TRACE
/* Human-readable names indexed by the TCP_* state constants. */
static char *statename[]={
	"Unused","Established","Syn Sent","Syn Recv",
	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
	"Close Wait","Last ACK","Listen","Closing"
};
#endif
495 static__inline__voidtcp_set_state(structsock *sk, intstate)
/* */ 496 { 497 if(sk->state==TCP_ESTABLISHED)
498 tcp_statistics.TcpCurrEstab--;
499 #ifdefSTATE_TRACE 500 if(sk->debug)
501 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
502 #endif 503 /* This is a hack but it doesn't occur often and it's going to 504 be a real to fix nicely */ 505
506 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
507 { 508 wake_up_interruptible(&master_select_wakeup);
509 } 510 sk->state=state;
511 if(state==TCP_ESTABLISHED)
512 tcp_statistics.TcpCurrEstab++;
513 if(sk->state==TCP_CLOSE)
514 tcp_cache_zap();
515 } 516
517 /* 518 * This routine picks a TCP windows for a socket based on 519 * the following constraints 520 * 521 * 1. The window can never be shrunk once it is offered (RFC 793) 522 * 2. We limit memory per socket 523 * 524 * For now we use NET2E3's heuristic of offering half the memory 525 * we have handy. All is not as bad as this seems however because 526 * of two things. Firstly we will bin packets even within the window 527 * in order to get the data we are waiting for into the memory limit. 528 * Secondly we bin common duplicate forms at receive time 529 * Better heuristics welcome 530 */ 531
532 inttcp_select_window(structsock *sk)
/* */ 533 { 534 intnew_window = sock_rspace(sk);
535
536 if(sk->window_clamp)
537 new_window=min(sk->window_clamp,new_window);
538 /* 539 * Two things are going on here. First, we don't ever offer a 540 * window less than min(sk->mss, MAX_WINDOW/2). This is the 541 * receiver side of SWS as specified in RFC1122. 542 * Second, we always give them at least the window they 543 * had before, in order to avoid retracting window. This 544 * is technically allowed, but RFC1122 advises against it and 545 * in practice it causes trouble. 546 * 547 * Fixme: This doesn't correctly handle the case where 548 * new_window > sk->window but not by enough to allow for the 549 * shift in sequence space. 550 */ 551 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
552 return(sk->window);
553 return(new_window);
554 } 555
556 /* 557 * Find someone to 'accept'. Must be called with 558 * sk->inuse=1 or cli() 559 */ 560
561 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 562 { 563 structsk_buff *p=skb_peek(&s->receive_queue);
564 if(p==NULL)
565 returnNULL;
566 do 567 { 568 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
569 returnp;
570 p=p->next;
571 } 572 while(p!=(structsk_buff *)&s->receive_queue);
573 returnNULL;
574 } 575
576 /* 577 * Remove a completed connection and return it. This is used by 578 * tcp_accept() to get connections from the queue. 579 */ 580
581 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 582 { 583 structsk_buff *skb;
584 unsignedlongflags;
585 save_flags(flags);
586 cli();
587 skb=tcp_find_established(s);
588 if(skb!=NULL)
589 skb_unlink(skb); /* Take it off the queue */ 590 restore_flags(flags);
591 returnskb;
592 } 593
594 /* 595 * This routine closes sockets which have been at least partially 596 * opened, but not yet accepted. Currently it is only called by 597 * tcp_close, and timeout mirrors the value there. 598 */ 599
600 staticvoidtcp_close_pending (structsock *sk)
/* */ 601 { 602 structsk_buff *skb;
603
604 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
605 { 606 skb->sk->dead=1;
607 tcp_close(skb->sk, 0);
608 kfree_skb(skb, FREE_READ);
609 } 610 return;
611 } 612
/*
 *	Enter the time wait state.  Per the state notes at the top of this
 *	file, TIME_WAIT catches resent junk before the socket finally
 *	closes; the MSL timer below ends the wait.
 */

static void tcp_time_wait(struct sock *sk)
{
	tcp_set_state(sk, TCP_TIME_WAIT);
	/* Both send and receive directions are finished with data. */
	sk->shutdown = SHUTDOWN_MASK;
	if (!sk->dead)
		sk->state_change(sk);	/* Let any sleeper see the transition. */
	reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}
/*
 *	A socket has timed out on its send queue and wants to do a
 *	little retransmitting. Currently this means TCP.
 *
 *	Walks sk->send_head (linked via skb->link3), refreshing each
 *	packet's ack/window fields, re-routing it and handing it back to
 *	the device.  If 'all' is zero only the first packet is resent;
 *	otherwise we stop after sk->cong_window packets.
 */

void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;
	int ct=0;
	struct rtable *rt;

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;	/* Restamp for RTT/timer bookkeeping. */

		/*
		 *	Discard the surplus MAC header
		 */

		skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);

		/*
		 * In general it's OK just to use the old packet.  However we
		 * need to use the current ack and window fields.  Urg and
		 * urg_ptr could possibly stand to be updated as well, but we
		 * don't keep the necessary data.  That shouldn't be a problem,
		 * if the other end is doing the right thing.  Since we're
		 * changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)skb->data;
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = ntohs(iph->tot_len) - (iph->ihl<<2);

		/*
		 *	Note: We ought to check for window limits here but
		 *	currently this is done (less efficiently) elsewhere.
		 */

		iph->id = htons(ip_id_count++);
		ip_send_check(iph);	/* IP header changed, so rechecksum it. */

		/*
		 *	Put a MAC header back on (may cause ARPing)
		 */

		if(skb->localroute)
			rt=ip_rt_local(iph->daddr,NULL,NULL);
		else
			rt=ip_rt_route(iph->daddr,NULL,NULL);

		if(rt==NULL)	/* Deep poo */
		{
			if(skb->sk)
			{
				skb->sk->err=ENETUNREACH;
				skb->sk->error_report(skb->sk);
			}
		}
		else
		{
			/* The route may have changed since the original send:
			   refresh device, next hop and link-layer header. */
			dev=rt->rt_dev;
			skb->raddr=rt->rt_gateway;
			if(skb->raddr==0)
				skb->raddr=iph->daddr;
			skb->dev=dev;
			skb->arp=1;
			if(dev->hard_header)
			{
				if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
					skb->arp=0;	/* Address unresolved; ARP needed. */
			}

			/*
			 *	This is not the right way to handle this. We have to
			 *	issue an up to date window and ack report with this
			 *	retransmit to keep the odd buggy tcp that relies on
			 *	the fact BSD does this happy.
			 *	We don't however need to recalculate the entire
			 *	checksum, so someone wanting a small problem to play
			 *	with might like to implement RFC1141/RFC1624 and speed
			 *	this up by avoiding a full checksum.
			 */

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			/*
			 *	If the interface is (still) up and running, kick it.
			 */

			if (dev->flags & IFF_UP)
			{
				/*
				 *	If the packet is still being sent by the device/protocol
				 *	below then don't retransmit. This is both needed, and good -
				 *	especially with connected mode AX.25 where it stops resends
				 *	occurring of an as yet unsent anyway frame!
				 *	We still add up the counts as the round trip time wants
				 *	adjusting.
				 */
				if (sk && !skb_device_locked(skb))
				{
					/* Remove it from any existing driver queue first! */
					skb_unlink(skb);
					/* Now queue it */
					ip_statistics.IpOutRequests++;
					dev_queue_xmit(skb, dev, sk->priority);
				}
			}
		}

		/*
		 *	Count retransmissions
		 */

		ct++;
		sk->prot->retransmits ++;
		tcp_statistics.TcpRetransSegs++;


		/*
		 *	Only one retransmit requested.
		 */

		if (!all)
			break;

		/*
		 *	This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;	/* Next buffer on the send queue. */
	}
}
776 /* 777 * Reset the retransmission timer 778 */ 779
780 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 781 { 782 del_timer(&sk->retransmit_timer);
783 sk->ip_xmit_timeout = why;
784 if((int)when < 0)
785 { 786 when=3;
787 printk("Error: Negative timer in xmit_timer\n");
788 } 789 sk->retransmit_timer.expires=jiffies+when;
790 add_timer(&sk->retransmit_timer);
791 } 792
/*
 *	This is the normal code called for timeouts.  It does the retransmission
 *	and then does backoff.  tcp_do_retransmit is separated out because
 *	tcp_ack needs to send stuff from the retransmit queue without
 *	initiating a backoff.
 */

void tcp_retransmit_time(struct sock *sk, int all)
{
	tcp_do_retransmit(sk, all);

	/*
	 * Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */

	sk->retransmits++;
	sk->prot->retransmits++;
	sk->backoff++;
	/* Exponential backoff, clamped at the protocol maximum of 120s. */
	sk->rto = min(sk->rto << 1, 120*HZ);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
}
829
830 /* 831 * A timer event has trigger a tcp retransmit timeout. The 832 * socket xmit queue is ready and set up to send. Because 833 * the ack receive code keeps the queue straight we do 834 * nothing clever here. 835 */ 836
837 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 838 { 839 if (all)
840 { 841 tcp_retransmit_time(sk, all);
842 return;
843 } 844
845 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 846 /* sk->ssthresh in theory can be zero. I guess that's OK */ 847 sk->cong_count = 0;
848
849 sk->cong_window = 1;
850
851 /* Do the actual retransmit. */ 852 tcp_retransmit_time(sk, all);
853 } 854
/*
 *	A write timeout has occurred. Process the after effects.
 *
 *	Returns 0 when the socket has been closed and released (the caller
 *	must not touch it further), 1 when the connection survives.
 */

static int tcp_write_timeout(struct sock *sk)
{
	/*
	 *	Look for a 'soft' timeout.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
	{
		/*
		 *	Attempt to recover if arp has changed (unlikely!) or
		 *	a route has shifted (not supported prior to 1.3).
		 */
		arp_destroy (sk->daddr, 0);
		/*ip_route_check (sk->daddr);*/
	}

	/*
	 *	Have we tried to SYN too many times (repent repent 8))
	 */

	if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
	{
		sk->err=ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		tcp_statistics.TcpAttemptFails++;	/* Is this right ??? - FIXME - */
		tcp_set_state(sk,TCP_CLOSE);
		/* Don't FIN, we got nothing back */
		release_sock(sk);
		return 0;
	}
	/*
	 *	Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2)
	{
		sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 *	Time wait the socket: half-closed states still need to
		 *	absorb stray retransmits from the peer.
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 *	Clean up time.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			release_sock(sk);
			return 0;
		}
	}
	return 1;
}
/*
 *	The TCP retransmit timer. This lacks a few small details.
 *
 *	1. 	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 *
 *	'data' is the struct sock * the timer was armed for; the reason the
 *	timer fired is read back from sk->ip_xmit_timeout.
 */

static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;

	/*
	 *	only process if socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;	/* Claim the socket so nothing else runs on it. */
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				/* Nothing outstanding; nothing to retransmit. */
				restore_flags(flags);
			}
			else
			{
				/*
				 *	Kicked by a delayed ack. Reset timer
				 *	correctly now
				 */
				if (jiffies < skb->when + sk->rto)
				{
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission
				 */
				sk->retransmits++;
				sk->prot->retransmits++;
				sk->prot->retransmit (sk, 0);
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
/*
 *	This routine is called by the ICMP module when it gets some
 *	sort of error condition.  If err < 0 then the socket should
 *	be closed and the error returned to the user.  If err > 0
 *	it's just the icmp type << 8 | icmp code.  After adjustment
 *	header points to the first 8 bytes of the tcp header.  We need
 *	to find the appropriate port.
 */

void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
	__u32 saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	struct iphdr *iph=(struct iphdr *)header;

	/* Skip past the IP header to reach the embedded TCP header. */
	header+=4*iph->ihl;


	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)
		return;		/* Error for a connection we know nothing about */

	if (type == ICMP_SOURCE_QUENCH)
	{
		/*
		 * FIXME:
		 * For now we will just trigger a linear backoff.
		 * The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	if (type == ICMP_PARAMETERPROB)
	{
		sk->err=EPROTO;
		sk->error_report(sk);
	}
	/* NOTE(review): PARAMETERPROB deliberately falls through to the
	 * generic code conversion below rather than returning. */

	/*
	 *	If we've already connected we will keep trying
	 *	until we time out, or the user gives up.
	 *	(code < 13 keeps us inside the icmp_err_convert table.)
	 */

	if (code < 13 && (icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV))
	{
		sk->err = icmp_err_convert[code].errno;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
		{
			/* A fatal error during connection setup kills the socket. */
			tcp_statistics.TcpAttemptFails++;
			tcp_set_state(sk,TCP_CLOSE);
			sk->error_report(sk);		/* Wake people up to see the error (see connect in sock.c) */
		}
	}
	return;
}
1091
/*
 *	Walk down the receive queue counting readable data until we hit the end or we find a gap
 *	in the received data queue (ie a frame missing that needs sending to us).  Not
 *	sorting using two queues as data arrives makes life so much harder.
 *
 *	Returns the number of bytes a read() could deliver right now
 *	(0 on NULL socket or empty queue).  Runs with interrupts off
 *	while scanning the queue.
 */

static int tcp_readable(struct sock *sk)
{
	unsigned long counted;	/* sequence number we have counted up to */
	unsigned long amount;	/* readable byte total */
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 *	Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->h.th->seq))	/* Found a hole so stops here */
			break;
		sum = skb->len -(counted - skb->h.th->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;		/* SYN occupies a sequence number but carries no data */
		if (sum > 0)
		{	/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;	/* don't count the SYN itself as readable */
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop.  This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available.  If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
1169 /*1170 * LISTEN is a special case for select..1171 */1172 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */1173 {1174 if (sel_type == SEL_IN) {1175 intretval;
1176
1177 sk->inuse = 1;
1178 retval = (tcp_find_established(sk) != NULL);
1179 release_sock(sk);
1180 if (!retval)
1181 select_wait(&master_select_wakeup,wait);
1182 returnretval;
1183 }1184 return 0;
1185 }1186
1187
1188 /*1189 * Wait for a TCP event.1190 *1191 * Note that we don't need to set "sk->inuse", as the upper select layers1192 * take care of normal races (between the test and the event) and we don't1193 * go look at any of the socket buffers directly.1194 */1195 staticinttcp_select(structsock *sk, intsel_type, select_table *wait)
/* */1196 {1197 if (sk->state == TCP_LISTEN)
1198 returntcp_listen_select(sk, sel_type, wait);
1199
1200 switch(sel_type) {1201 caseSEL_IN:
1202 if (sk->err)
1203 return 1;
1204 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1205 break;
1206
1207 if (sk->shutdown & RCV_SHUTDOWN)
1208 return 1;
1209
1210 if (sk->acked_seq == sk->copied_seq)
1211 break;
1212
1213 if (sk->urg_seq != sk->copied_seq ||
1214 sk->acked_seq != sk->copied_seq+1 ||
1215 sk->urginline || !sk->urg_data)
1216 return 1;
1217 break;
1218
1219 caseSEL_OUT:
1220 if (sk->err)
1221 return 1;
1222 if (sk->shutdown & SEND_SHUTDOWN)
1223 return 0;
1224 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1225 break;
1226 /*1227 * This is now right thanks to a small fix1228 * by Matt Dillon.1229 */1230
1231 if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
1232 break;
1233 return 1;
1234
1235 caseSEL_EX:
1236 if (sk->urg_data)
1237 return 1;
1238 break;
1239 }1240 select_wait(sk->sleep, wait);
1241 return 0;
1242 }1243
1244 inttcp_ioctl(structsock *sk, intcmd, unsignedlongarg)
/* */1245 {1246 interr;
1247 switch(cmd)
1248 {1249
1250 caseTIOCINQ:
1251 #ifdef FIXME /* FIXME: */1252 caseFIONREAD:
1253 #endif1254 {1255 unsignedlongamount;
1256
1257 if (sk->state == TCP_LISTEN)
1258 return(-EINVAL);
1259
1260 sk->inuse = 1;
1261 amount = tcp_readable(sk);
1262 release_sock(sk);
1263 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1264 if(err)
1265 returnerr;
1266 put_user(amount, (int *)arg);
1267 return(0);
1268 }1269 caseSIOCATMARK:
1270 {1271 intansw = sk->urg_data && sk->urg_seq == sk->copied_seq;
1272
1273 err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1274 if (err)
1275 returnerr;
1276 put_user(answ,(int *) arg);
1277 return(0);
1278 }1279 caseTIOCOUTQ:
1280 {1281 unsignedlongamount;
1282
1283 if (sk->state == TCP_LISTEN) return(-EINVAL);
1284 amount = sock_wspace(sk);
1285 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1286 if(err)
1287 returnerr;
1288 put_user(amount, (int *)arg);
1289 return(0);
1290 }1291 default:
1292 return(-EINVAL);
1293 }1294 }1295
1296
/*
 *	This routine computes a TCP checksum.
 *
 *	Modified January 1995 from a go-faster DOS routine by
 *	Jorge Cwik <jorge@laser.satlink.net>
 *
 *	'base' is the partial checksum over the TCP segment itself;
 *	this folds in the pseudo-header (addresses, protocol, length).
 */

unsigned short tcp_check(struct tcphdr *th, int len,
	 unsigned long saddr, unsigned long daddr, unsigned long base)
{
	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
}
1310
1311
1312 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1313 unsignedlongdaddr, intlen, structsock *sk)
1314 {1315 th->check = 0;
1316 th->check = tcp_check(th, len, saddr, daddr,
1317 csum_partial((char *)th,len,0));
1318 return;
1319 }1320
1321 /*1322 * This is the main buffer sending routine. We queue the buffer1323 * having checked it is sane seeming.1324 */1325
1326 staticvoidtcp_send_skb(structsock *sk, structsk_buff *skb)
/* */1327 {1328 intsize;
1329 structtcphdr * th = skb->h.th;
1330
1331 /*1332 * length of packet (not counting length of pre-tcp headers) 1333 */1334
1335 size = skb->len - ((unsignedchar *) th - skb->data);
1336
1337 /*1338 * Sanity check it.. 1339 */1340
1341 if (size < sizeof(structtcphdr) || size > skb->len)
1342 {1343 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1344 skb, skb->data, th, skb->len);
1345 kfree_skb(skb, FREE_WRITE);
1346 return;
1347 }1348
1349 /*1350 * If we have queued a header size packet.. (these crash a few1351 * tcp stacks if ack is not set)1352 */1353
1354 if (size == sizeof(structtcphdr))
1355 {1356 /* If it's got a syn or fin it's notionally included in the size..*/1357 if(!th->syn && !th->fin)
1358 {1359 printk("tcp_send_skb: attempt to queue a bogon.\n");
1360 kfree_skb(skb,FREE_WRITE);
1361 return;
1362 }1363 }1364
1365 /*1366 * Actual processing.1367 */1368
1369 tcp_statistics.TcpOutSegs++;
1370 skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1371
1372 /*1373 * We must queue if1374 *1375 * a) The right edge of this frame exceeds the window1376 * b) We are retransmitting (Nagle's rule)1377 * c) We have too many packets 'in flight'1378 */1379
1380 if (after(skb->h.seq, sk->window_seq) ||
1381 (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1382 sk->packets_out >= sk->cong_window)
1383 {1384 /* checksum will be supplied by tcp_write_xmit. So1385 * we shouldn't need to set it at all. I'm being paranoid */1386 th->check = 0;
1387 if (skb->next != NULL)
1388 {1389 printk("tcp_send_partial: next != NULL\n");
1390 skb_unlink(skb);
1391 }1392 skb_queue_tail(&sk->write_queue, skb);
1393
1394 /*1395 * If we don't fit we have to start the zero window1396 * probes. This is broken - we really need to do a partial1397 * send _first_ (This is what causes the Cisco and PC/TCP1398 * grief).1399 */1400
1401 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1402 sk->send_head == NULL && sk->ack_backlog == 0)
1403 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1404 }1405 else1406 {1407 /*1408 * This is going straight out1409 */1410
1411 th->ack_seq = ntohl(sk->acked_seq);
1412 th->window = ntohs(tcp_select_window(sk));
1413
1414 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1415
1416 sk->sent_seq = sk->write_seq;
1417
1418 /*1419 * This is mad. The tcp retransmit queue is put together1420 * by the ip layer. This causes half the problems with1421 * unroutable FIN's and other things.1422 */1423
1424 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1425
1426 /*1427 * Set for next retransmit based on expected ACK time.1428 * FIXME: We set this every time which means our 1429 * retransmits are really about a window behind.1430 */1431
1432 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1433 }1434 }1435
1436 /*1437 * Locking problems lead us to a messy situation where we can have1438 * multiple partially complete buffers queued up. This is really bad1439 * as we don't want to be sending partial buffers. Fix this with1440 * a semaphore or similar to lock tcp_write per socket.1441 *1442 * These routines are pretty self descriptive.1443 */1444
1445 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1446 {1447 structsk_buff * skb;
1448 unsignedlongflags;
1449
1450 save_flags(flags);
1451 cli();
1452 skb = sk->partial;
1453 if (skb) {1454 sk->partial = NULL;
1455 del_timer(&sk->partial_timer);
1456 }1457 restore_flags(flags);
1458 returnskb;
1459 }1460
1461 /*1462 * Empty the partial queue1463 */1464
1465 staticvoidtcp_send_partial(structsock *sk)
/* */1466 {1467 structsk_buff *skb;
1468
1469 if (sk == NULL)
1470 return;
1471 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1472 tcp_send_skb(sk, skb);
1473 }1474
1475 /*1476 * Queue a partial frame1477 */1478
1479 voidtcp_enqueue_partial(structsk_buff * skb, structsock * sk)
/* */1480 {1481 structsk_buff * tmp;
1482 unsignedlongflags;
1483
1484 save_flags(flags);
1485 cli();
1486 tmp = sk->partial;
1487 if (tmp)
1488 del_timer(&sk->partial_timer);
1489 sk->partial = skb;
1490 init_timer(&sk->partial_timer);
1491 /*1492 * Wait up to 1 second for the buffer to fill.1493 */1494 sk->partial_timer.expires = jiffies+HZ;
1495 sk->partial_timer.function = (void (*)(unsignedlong)) tcp_send_partial;
1496 sk->partial_timer.data = (unsignedlong) sk;
1497 add_timer(&sk->partial_timer);
1498 restore_flags(flags);
1499 if (tmp)
1500 tcp_send_skb(sk, tmp);
1501 }1502
1503
/*
 *	This routine sends an ack and also updates the window.
 *
 *	'sequence'/'ack' are host-order sequence and acknowledgement numbers;
 *	'th' is the header of the segment being acknowledged (used as a
 *	template), 'daddr' the destination address.
 */

static void tcp_send_ack(u32 sequence, u32 ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;		/* We have been reset, we may not send again */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 *	Force it to send an ack.  We don't have to do this
		 *	(ACK is unreliable) but it's much better use of
		 *	bandwidth on slow links to send a spare ack than
		 *	resend packets.
		 */

		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 *	Assemble a suitable TCP frame
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route: throw the buffer away. */
		buff->free = 1;
		sock_wfree(sk, buff);
		return;
	}
	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	/* NOTE(review): ntohl/ntohs used here where htonl/htons is meant;
	 * the functions are symmetric so the result is identical. */
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 *	If we have nothing queued for transmit and the transmit timer
	 *	is on we are just doing an ACK timeout and need to switch
	 *	to a keepalive.
	 */

	if (ack == sk->acked_seq)
	{
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
			&& sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 *	Fill in the packet and send it
	 */

	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}
1619
1620 /* 1621 * This routine builds a generic TCP header. 1622 */1623
1624 extern__inlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */1625 {1626
1627 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1628 th->seq = htonl(sk->write_seq);
1629 th->psh =(push == 0) ? 1 : 0;
1630 th->doff = sizeof(*th)/4;
1631 th->ack = 1;
1632 th->fin = 0;
1633 sk->ack_backlog = 0;
1634 sk->bytes_rcv = 0;
1635 sk->ack_timed = 0;
1636 th->ack_seq = htonl(sk->acked_seq);
1637 sk->window = tcp_select_window(sk);
1638 th->window = htons(sk->window);
1639
1640 return(sizeof(*th));
1641 }1642
/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	Returns the number of bytes queued/sent, or a negative errno.
 *	May sleep (unless 'nonblock') waiting for connection establishment
 *	or for send buffer memory.
 */

static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags)
{
	int copied = 0;		/* bytes accepted from the user so far */
	int copy;		/* bytes to take for the current segment */
	int tmp;
	int seglen;		/* bytes left in the current iovec entry */
	int iovct=0;		/* iovec entry being consumed */
	struct sk_buff *skb;
	struct sk_buff *send_tmp;	/* skb that may become a partial frame */
	struct proto *prot;
	struct device *dev = NULL;
	unsigned char *from;	/* user pointer into the current iovec */

	/*
	 *	Do sanity checking for sendmsg/sendto/send
	 */

	if (flags & ~(MSG_OOB|MSG_DONTROUTE))
		return -EINVAL;
	if (msg->msg_name)
	{
		/* An address is only acceptable if it names the peer we
		 * are already connected to. */
		struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
		if(sk->state == TCP_CLOSE)
			return -ENOTCONN;
		if (msg->msg_namelen < sizeof(*addr))
			return -EINVAL;
		if (addr->sin_family && addr->sin_family != AF_INET)
			return -EINVAL;
		if (addr->sin_port != sk->dummy_th.dest)
			return -EISCONN;
		if (addr->sin_addr.s_addr != sk->daddr)
			return -EISCONN;
	}

	/*
	 *	Ok commence sending
	 */

	while(iovct<msg->msg_iovlen)
	{
		seglen=msg->msg_iov[iovct].iov_len;
		from=msg->msg_iov[iovct++].iov_base;
		sk->inuse=1;
		prot = sk->prot;
		while(seglen > 0)
		{
			if (sk->err)
			{			/* Stop on an error */
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			/*
			 *	First thing we do is make sure that we are established.
			 */

			if (sk->shutdown & SEND_SHUTDOWN)
			{
				release_sock(sk);
				sk->err = EPIPE;
				if (copied)
					return(copied);
				sk->err = 0;
				return(-EPIPE);
			}

			/*
			 *	Wait for a connection to finish.
			 */

			while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
			{
				if (sk->err)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
				{
					/* Connection is in no state to ever
					 * become sendable: report EPIPE. */
					release_sock(sk);
					if (copied)
						return(copied);

					if (sk->err)
					{
						tmp = -sk->err;
						sk->err = 0;
						return(tmp);
					}

					if (sk->keepopen)
					{
						send_sig(SIGPIPE, current, 0);
					}
					return(-EPIPE);
				}

				if (nonblock || copied)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return(-EAGAIN);
				}

				release_sock(sk);
				cli();

				/* Re-test the state with interrupts off before
				 * sleeping, to close the wakeup race. */
				if (sk->state != TCP_ESTABLISHED &&
				    sk->state != TCP_CLOSE_WAIT && sk->err == 0)
				{
					interruptible_sleep_on(sk->sleep);
					if (current->signal & ~current->blocked)
					{
						sti();
						if (copied)
							return(copied);
						return(-ERESTARTSYS);
					}
				}
				sk->inuse = 1;
				sti();
			}

			/*
			 *	The following code can result in copy <= if sk->mss is ever
			 *	decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
			 *	sk->mtu is constant once SYN processing is finished.  I.e. we
			 *	had better not get here until we've seen his SYN and at least one
			 *	valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
			 *	But ESTABLISHED should guarantee that.  sk->max_window is by definition
			 *	non-decreasing.  Note that any ioctl to set user_mss must be done
			 *	before the exchange of SYN's.  If the initial ack from the other
			 *	end has a window of 0, max_window and thus mss will both be 0.
			 */

			/*
			 *	Now we need to check if we have a half built packet.
			 */

			if ((skb = tcp_dequeue_partial(sk)) != NULL)
			{
				int hdrlen;

				/* IP header + TCP header */
				hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
					 + sizeof(struct tcphdr);

				/* Add more stuff to the end of skb->len */
				if (!(flags & MSG_OOB))
				{
					copy = min(sk->mss - (skb->len - hdrlen), len);
					/* FIXME: this is really a bug. */
					if (copy <= 0)
					{
						printk("TCP: **bug**: \"copy\" <= 0!!\n");
						copy = 0;
					}
					memcpy_fromfs(skb_put(skb,copy), from, copy);
					from += copy;
					copied += copy;
					len -= copy;
					sk->write_seq += copy;
					seglen -= copy;
				}
				/* Full segment, OOB data or nothing in flight:
				 * send now; otherwise re-queue as partial. */
				if ((skb->len - hdrlen) >= sk->mss ||
				    (flags & MSG_OOB) || !sk->packets_out)
					tcp_send_skb(sk, skb);
				else
					tcp_enqueue_partial(skb, sk);
				continue;
			}

			/*
			 *	We also need to worry about the window.
			 *	If window < 1/2 the maximum window we've seen from this
			 *	host, don't use it.  This is sender side
			 *	silly window prevention, as specified in RFC1122.
			 *	(Note that this is different than earlier versions of
			 *	SWS prevention, e.g. RFC813.).  What we actually do is
			 *	use the whole MSS.  Since the results in the right
			 *	edge of the packet being outside the window, it will
			 *	be queued for later rather than sent.
			 */

			copy = sk->window_seq - sk->write_seq;
			if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
				copy = sk->mss;
			if (copy > len)
				copy = len;

			/*
			 *	We should really check the window here also.
			 */

			send_tmp = NULL;
			if (copy < sk->mss && !(flags & MSG_OOB))
			{
				/*
				 *	We will release the socket in case we sleep here.
				 */
				release_sock(sk);
				/*
				 *	NB: following must be mtu, because mss can be increased.
				 *	mss is always <= mtu
				 */
				skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
				sk->inuse = 1;
				send_tmp = skb;
			}
			else
			{
				/*
				 *	We will release the socket in case we sleep here.
				 */
				release_sock(sk);
				skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
				sk->inuse = 1;
			}

			/*
			 *	If we didn't get any memory, we need to sleep.
			 */

			if (skb == NULL)
			{
				sk->socket->flags |= SO_NOSPACE;
				if (nonblock)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return(-EAGAIN);
				}

				/*
				 *	FIXME: here is another race condition.
				 */

				tmp = sk->wmem_alloc;
				release_sock(sk);
				cli();
				/*
				 *	Again we will try to avoid it.
				 */
				if (tmp <= sk->wmem_alloc &&
				  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				  && sk->err == 0)
				{
					sk->socket->flags &= ~SO_NOSPACE;
					interruptible_sleep_on(sk->sleep);
					if (current->signal & ~current->blocked)
					{
						sti();
						if (copied)
							return(copied);
						return(-ERESTARTSYS);
					}
				}
				sk->inuse = 1;
				sti();
				continue;
			}

			skb->sk = sk;
			skb->free = 0;
			skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

			/*
			 *	FIXME: we need to optimize this.
			 *	Perhaps some hints here would be good.
			 */

			tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl);
			if (tmp < 0 )
			{
				sock_wfree(sk, skb);
				release_sock(sk);
				if (copied)
					return(copied);
				return(tmp);
			}
			skb->dev = dev;
			skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
			tmp = tcp_build_header(skb->h.th, sk, len-copy);
			if (tmp < 0)
			{
				sock_wfree(sk, skb);
				release_sock(sk);
				if (copied)
					return(copied);
				return(tmp);
			}

			if (flags & MSG_OOB)
			{
				skb->h.th->urg = 1;
				skb->h.th->urg_ptr = ntohs(copy);
			}

			memcpy_fromfs(skb_put(skb,copy), from, copy);

			from += copy;
			copied += copy;
			len -= copy;
			seglen -= copy;
			skb->free = 0;
			sk->write_seq += copy;

			/* Sub-MSS frame while data is in flight: hold it back
			 * as a partial frame (Nagle). */
			if (send_tmp != NULL && sk->packets_out)
			{
				tcp_enqueue_partial(send_tmp, sk);
				continue;
			}
			tcp_send_skb(sk, skb);
		}
	}
	sk->err = 0;

	/*
	 *	Nagle's rule.  Turn Nagle off with TCP_NODELAY for highly
	 *	interactive fast network servers.  It's meant to be on and
	 *	it really improves the throughput though not the echo time
	 *	on my slow slip link - Alan
	 */

	/*
	 *	Avoid possible race on send_tmp - c/o Johannes Stille
	 */

	if(sk->partial && ((!sk->packets_out)
	/* If not nagling we can send on the before case too.. */
	   || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
1999 staticinttcp_sendto(structsock *sk, constunsignedchar *ubuf, intsize, intnoblock, unsignedflags,
/* */2000 structsockaddr_in *sin, intaddr_len)
2001 {2002 structioveciov;
2003 structmsghdrmsg;
2004
2005 iov.iov_base = (void *)ubuf;
2006 iov.iov_len = size;
2007
2008 msg.msg_name = (void *)sin;
2009 msg.msg_namelen = addr_len;
2010 msg.msg_accrights = NULL;
2011 msg.msg_iov = &iov;
2012 msg.msg_iovlen = 1;
2013
2014 returntcp_sendmsg(sk, &msg, size, noblock, flags);
2015 }2016
/*
 *	Old-style write(): an unaddressed send, expressed as a sendto with
 *	no destination.
 */
static int tcp_write(struct sock *sk, const unsigned char *ubuf, int size, int noblock, unsigned flags)
{
	return tcp_sendto(sk,ubuf,size,noblock,flags,NULL,0);
}
2022
/*
 *	Send an ack if one is backlogged at this point.  Ought to merge
 *	this with tcp_send_ack().
 */

static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	if (!sk->ack_backlog)
		return;		/* No ack owed */

	/*
	 * If we're closed, don't send an ack, or we'll get a RST
	 * from the closed destination.
	 */
	if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
		return;

	/*
	 * FIXME: we need to put code here to prevent this routine from
	 * being called.  Being called once in a while is ok, so only check
	 * if this is the second time in a row.
	 */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route: drop the buffer. */
		buff->free = 1;
		sock_wfree(sk, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	/* Build a bare ACK from the socket's template header. */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2102
/*
 * FIXME:
 * This routine frees used buffers.
 * It should consider sending an ACK to let the
 * other end know we now have a bigger window.
 */

static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;	/* receive space before freeing anything */
	struct sk_buff *skb;
	unsigned long rspace;	/* receive space after freeing */

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	left = sock_rspace(sk);

	/*
	 * We have to loop through all the buffer headers,
	 * and try to free up all the space we can.
	 * Stop at the first buffer still unread or still referenced.
	 */

	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 * FIXME:
	 * At this point we should send an ack if the difference
	 * in the window, and the amount of space is bigger than
	 * TCP_WINDOW_DIFF.
	 */

	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sock_rspace(sk),
			left);
	if ((rspace=sock_rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
		/*
		 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
		 * if the other end is offering a window smaller than the agreed on MSS
		 * (called sk->mtu here).  In theory there's no connection between send
		 * and receive, and so no reason to think that they're going to send
		 * small packets.  For the moment I'm using the hack of reducing the mss
		 * only on the send side, so I'm putting mtu here.
		 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);	/* put the earlier deadline back */
		}
	}
}
2192
/*
 *	Handle reading urgent data.  BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 *
 *	Returns 1 when the single urgent byte was delivered, 0 at EOF-like
 *	states, or a negative errno.  Never blocks.
 */

static int tcp_recv_urg(struct sock * sk, int nonblock,
	     struct msghdr *msg, int len, int flags, int *addr_len)
{
	/*
	 *	No URG data to read
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
	{
		/* Report and clear a pending socket error first. */
		int tmp = -sk->err;
		sk->err = 0;
		return tmp;
	}

	if (sk->state == TCP_CLOSE || sk->done)
	{
		if (!sk->done)
		{
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID)
	{
		/* The urgent byte lives in the low bits of urg_data. */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;	/* consume it */
		memcpy_toiovec(msg->msg_iov, &c, 1);
		if(msg->msg_name)
		{
			/* Fill in the peer's address for recvfrom() callers. */
			struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
			sin->sin_family=AF_INET;
			sin->sin_addr.s_addr=sk->daddr;
			sin->sin_port=sk->dummy_th.dest;
		}
		if(addr_len)
			*addr_len=sizeof(struct sockaddr_in);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
2260
2261 /*2262 * This routine copies from a sock struct into the user buffer. 2263 */2264
/*
 *	Copy received TCP data from the socket's receive queue into the
 *	user's iovec.  Handles peeking, urgent-data boundaries, FIN
 *	processing and blocking until data arrives.  Returns the number
 *	of bytes copied or a negative error.
 */
static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags, int *addr_len)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */

	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */

	if (flags & MSG_OOB)
		return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 *	A peek does not advance sk->copied_seq, so it works on a
	 *	private copy of the sequence counter.
	 */

	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;		/* lock the socket */
	while (len > 0)
	{
		struct sk_buff * skb;
		u32 offset;

		/*
		 * Are we at urgent data? Stop if we have read anything.
		 */

		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.  Mark ourselves interruptible
		 *	BEFORE scanning so a wakeup between the scan and
		 *	schedule() is not lost.
		 */

		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* Gap before the next buffer: nothing contiguous. */
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies one sequence number */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* fully consumed, reclaimable */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* Partial result is returned rather than sleeping. */
		if (copied)
			break;

		if (sk->err)
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		/* Connection gone: EOF first, ENOTCONN after that. */
		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Ack what we have consumed, then sleep for more data. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */

		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */

		used = skb->len - offset;
		if (len < used)
			used = len;

		/*
		 *	Do we have urgent data here?  If the urgent byte is
		 *	inside this chunk we either skip over it (non-inline)
		 *	or stop the copy just before it.
		 */

		if (sk->urg_data)
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					if (!sk->urginline)
					{
						++*seq;		/* step over the urgent byte */
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */

		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */

		memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */

		skb->users --;

		/* Urgent byte passed: clear the marker. */
		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;
		if (used + offset < skb->len)
			continue;

		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		continue;

	found_fin_ok:
		++*seq;		/* FIN occupies one sequence number */
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */

		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}

	/* Report the peer's address when data was actually delivered. */
	if(copied>0 && msg->msg_name)
	{
		struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
		sin->sin_family=AF_INET;
		sin->sin_addr.s_addr=sk->daddr;
		sin->sin_port=sk->dummy_th.dest;
	}
	if(addr_len)
		*addr_len=sizeof(struct sockaddr_in);

	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2502
2503 staticinttcp_recvfrom(structsock *sk, unsignedchar *ubuf, intsize, intnoblock, unsignedflags,
/* */2504 structsockaddr_in *sa, int *addr_len)
2505 {2506 structioveciov;
2507 structmsghdrmsg;
2508
2509 iov.iov_base = (void *)ubuf;
2510 iov.iov_len = size;
2511
2512 msg.msg_name = (void *)sa;
2513 msg.msg_namelen = 0;
2514 if (addr_len)
2515 msg.msg_namelen = *addr_len;
2516 msg.msg_accrights = NULL;
2517 msg.msg_iov = &iov;
2518 msg.msg_iovlen = 1;
2519
2520 returntcp_recvmsg(sk, &msg, size, noblock, flags, addr_len);
2521 }2522
2523 inttcp_read(structsock *sk, unsignedchar *buff, intlen, intnoblock,
/* */2524 unsignedflags)
2525 {2526 return(tcp_recvfrom(sk, buff, len, noblock, flags, NULL, NULL));
2527 }2528
2529
2530 /*2531 * State processing on a close. This implements the state shift for2532 * sending our FIN frame. Note that we only send a FIN for some 2533 * states. A shutdown() may have already sent the FIN, or we may be2534 * closed.2535 */2536
/*
 *	Perform the state transition for closing our side of the
 *	connection.  Returns 1 when the caller must transmit a FIN,
 *	0 otherwise.  'dead' is set when no application is left on
 *	this end (full close, not just shutdown).
 */
static int tcp_close_state(struct sock *sk, int dead)
{
	int ns=TCP_CLOSE;	/* default next state */
	int send_fin=0;		/* tell caller whether a FIN is required */
	switch(sk->state)
	{
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:	/* Closedown begin */
			ns=TCP_FIN_WAIT1;
			send_fin=1;
			break;
		case TCP_FIN_WAIT1:	/* Already closing, or FIN sent: no change */
		case TCP_FIN_WAIT2:
		case TCP_CLOSING:
			ns=sk->state;
			break;
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;
	}

	tcp_set_state(sk,ns);

	/*
	 *	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 *
	 *	Only start the FIN_WAIT2 reaper if no other timer is already
	 *	running (del_timer tells us whether one was pending; if so
	 *	it is put straight back).
	 */
	if(dead && ns==TCP_FIN_WAIT2)
	{
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
	}

	return send_fin;
}
2587 /*2588 * Send a fin.2589 */2590
/*
 *	Build and send (or queue) our FIN segment.  The FIN consumes one
 *	sequence number; if unsent data is still queued the FIN frame is
 *	appended to the write queue so it goes out in order.
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	/* GFP_KERNEL allocation may sleep, hence the unlock above. */
	buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;	/* re-lock the socket */

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost.
		 *	(Not good).  The sequence number is still consumed
		 *	and the close timer started so the socket dies.
		 */

		buff->free = 1;
		sock_wfree(sk,buff);
		sk->write_seq++;
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}

	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));	/* template header from dummy_th */
	t1->seq = ntohl(sk->write_seq);
	sk->write_seq++;		/* the FIN consumes a sequence number */
	buff->h.seq = sk->write_seq;
	t1->ack = 1;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 *	If there is data in the write queue, the fin must be appended to
	 *	the write queue so it is transmitted in sequence.
	 */

	if (skb_peek(&sk->write_queue) != NULL)
	{
		buff->free = 0;
		if (buff->next != NULL)
		{
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	}
	else
	{
		/* Queue empty: transmit now and start the retransmit timer. */
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2687 /*2688 * Shutdown the sending side of a connection. Much like close except2689 * that we don't receive shut down or set sk->dead=1.2690 */2691
2692 voidtcp_shutdown(structsock *sk, inthow)
/* */2693 {2694 /*2695 * We need to grab some memory, and put together a FIN,2696 * and then put it into the queue to be sent.2697 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2698 */2699
2700 if (!(how & SEND_SHUTDOWN))
2701 return;
2702
2703 /*2704 * If we've already sent a FIN, or it's a closed state2705 */2706
2707 if (sk->state == TCP_FIN_WAIT1 ||
2708 sk->state == TCP_FIN_WAIT2 ||
2709 sk->state == TCP_CLOSING ||
2710 sk->state == TCP_LAST_ACK ||
2711 sk->state == TCP_TIME_WAIT ||
2712 sk->state == TCP_CLOSE ||
2713 sk->state == TCP_LISTEN2714 )
2715 {2716 return;
2717 }2718 sk->inuse = 1;
2719
2720 /*2721 * flag that the sender has shutdown2722 */2723
2724 sk->shutdown |= SEND_SHUTDOWN;
2725
2726 /*2727 * Clear out any half completed packets. 2728 */2729
2730 if (sk->partial)
2731 tcp_send_partial(sk);
2732
2733 /*2734 * FIN if needed2735 */2736
2737 if(tcp_close_state(sk,0))
2738 tcp_send_fin(sk);
2739
2740 release_sock(sk);
2741 }2742
2743 /*2744 * This routine will send an RST to the other tcp. 2745 */2746
2747 staticvoidtcp_reset(unsignedlongsaddr, unsignedlongdaddr, structtcphdr *th,
/* */2748 structproto *prot, structoptions *opt, structdevice *dev, inttos, intttl)
2749 {2750 structsk_buff *buff;
2751 structtcphdr *t1;
2752 inttmp;
2753 structdevice *ndev=NULL;
2754
2755 /*2756 * Cannot reset a reset (Think about it).2757 */2758
2759 if(th->rst)
2760 return;
2761
2762 /*2763 * We need to grab some memory, and put together an RST,2764 * and then put it into the queue to be sent.2765 */2766
2767 buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2768 if (buff == NULL)
2769 return;
2770
2771 buff->sk = NULL;
2772 buff->dev = dev;
2773 buff->localroute = 0;
2774
2775 /*2776 * Put in the IP header and routing stuff. 2777 */2778
2779 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2780 sizeof(structtcphdr),tos,ttl);
2781 if (tmp < 0)
2782 {2783 buff->free = 1;
2784 sock_wfree(NULL, buff);
2785 return;
2786 }2787
2788 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
2789 memcpy(t1, th, sizeof(*t1));
2790
2791 /*2792 * Swap the send and the receive. 2793 */2794
2795 t1->dest = th->source;
2796 t1->source = th->dest;
2797 t1->rst = 1;
2798 t1->window = 0;
2799
2800 if(th->ack)
2801 {2802 t1->ack = 0;
2803 t1->seq = th->ack_seq;
2804 t1->ack_seq = 0;
2805 }2806 else2807 {2808 t1->ack = 1;
2809 if(!th->syn)
2810 t1->ack_seq=htonl(th->seq);
2811 else2812 t1->ack_seq=htonl(th->seq+1);
2813 t1->seq=0;
2814 }2815
2816 t1->syn = 0;
2817 t1->urg = 0;
2818 t1->fin = 0;
2819 t1->psh = 0;
2820 t1->doff = sizeof(*t1)/4;
2821 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2822 prot->queue_xmit(NULL, ndev, buff, 1);
2823 tcp_statistics.TcpOutSegs++;
2824 }2825
2826
2827 /*2828 * Look for tcp options. Parses everything but only knows about MSS.2829 * This routine is always called with the packet containing the SYN.2830 * However it may also be called with the ack to the SYN. So you2831 * can't assume this is always the SYN. It's always called after2832 * we have set up sk->mtu to our own MTU.2833 *2834 * We need at minimum to add PAWS support here. Possibly large windows2835 * as Linux gets deployed on 100Mb/sec networks.2836 */2837
/*
 *	Look for tcp options. Parses everything but only knows about MSS.
 *	This routine is always called with the packet containing the SYN.
 *	However it may also be called with the ack to the SYN. So you
 *	can't assume this is always the SYN. It's always called after
 *	we have set up sk->mtu to our own MTU.
 *
 *	We need at minimum to add PAWS support here. Possibly large windows
 *	as Linux gets deployed on 100Mb/sec networks.
 */
static void tcp_options(struct sock *sk, struct tcphdr *th)
{
	unsigned char *ptr;
	/* Bytes of option data = header length minus the fixed 20 bytes. */
	int length=(th->doff*4)-sizeof(struct tcphdr);
	int mss_seen = 0;

	ptr = (unsigned char *)(th + 1);	/* options start after the header */

	/*
	 * NOTE(review): opsize is fetched before checking that at least
	 * two bytes remain; an option block ending in a lone non-NOP/EOL
	 * opcode byte would read one byte past the option area.  Presumably
	 * harmless within a full skb, but confirm against the caller's
	 * header-length validation.
	 */
	while(length>0)
	{
		int opcode=*ptr++;
		int opsize=*ptr++;
		switch(opcode)
		{
			case TCPOPT_EOL:
				return;		/* end of option list */
			case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
				length--;
				ptr--;		/* the opsize=*ptr++ above was a mistake */
				continue;

			default:
				if(opsize<=2)	/* Avoid silly options looping forever */
					return;
				switch(opcode)
				{
					case TCPOPT_MSS:
						/* Clamp our notion of the path MSS; only valid on a SYN. */
						if(opsize==4 && th->syn)
						{
							sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
							mss_seen = 1;
						}
						break;
					/* Add other options here as people feel the urge to implement stuff like large windows */
				}
				ptr+=opsize-2;
				length-=opsize;
		}
	}
	if (th->syn)
	{
		if (! mss_seen)
			sk->mtu=min(sk->mtu, 536);	/* default MSS if none sent */
	}
#ifdef CONFIG_INET_PCTCP
	/* Hack: halve the window for hosts without SWS avoidance. */
	sk->mss = min(sk->max_window >> 1, sk->mtu);
#else
	sk->mss = min(sk->max_window, sk->mtu);
#endif
}
/*
 *	Guess the classful netmask for an address supplied in network
 *	byte order; the mask comes back in network byte order too.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);
	unsigned long mask;

	if (IN_CLASSA(host))
		mask = IN_CLASSA_NET;
	else if (IN_CLASSB(host))
		mask = IN_CLASSB_NET;
	else
		mask = IN_CLASSC_NET;

	return htonl(mask);
}
2899 /*2900 * Default sequence number picking algorithm.2901 * As close as possible to RFC 793, which2902 * suggests using a 250kHz clock.2903 * Further reading shows this assumes 2MB/s networks.2904 * For 10MB/s ethernet, a 1MHz clock is appropriate.2905 * That's funny, Linux has one built in! Use it!2906 */2907
2908 externinlineu32tcp_init_seq(void)
/* */2909 {2910 structtimevaltv;
2911 do_gettimeofday(&tv);
2912 returntv.tv_usec+tv.tv_sec*1000000;
2913 }2914
2915 /*2916 * This routine handles a connection request.2917 * It should make sure we haven't already responded.2918 * Because of the way BSD works, we have to send a syn/ack now.2919 * This also means it will be harder to close a socket which is2920 * listening.2921 */2922
2923 staticvoidtcp_conn_request(structsock *sk, structsk_buff *skb,
/* */2924 unsignedlongdaddr, unsignedlongsaddr,
2925 structoptions *opt, structdevice *dev, u32seq)
2926 {2927 structsk_buff *buff;
2928 structtcphdr *t1;
2929 unsignedchar *ptr;
2930 structsock *newsk;
2931 structtcphdr *th;
2932 structdevice *ndev=NULL;
2933 inttmp;
2934 structrtable *rt;
2935
2936 th = skb->h.th;
2937
2938 /* If the socket is dead, don't accept the connection. */2939 if (!sk->dead)
2940 {2941 sk->data_ready(sk,0);
2942 }2943 else2944 {2945 if(sk->debug)
2946 printk("Reset on %p: Connect on dead socket.\n",sk);
2947 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2948 tcp_statistics.TcpAttemptFails++;
2949 kfree_skb(skb, FREE_READ);
2950 return;
2951 }2952
2953 /*2954 * Make sure we can accept more. This will prevent a2955 * flurry of syns from eating up all our memory.2956 */2957
2958 if (sk->ack_backlog >= sk->max_ack_backlog)
2959 {2960 tcp_statistics.TcpAttemptFails++;
2961 kfree_skb(skb, FREE_READ);
2962 return;
2963 }2964
2965 /*2966 * We need to build a new sock struct.2967 * It is sort of bad to have a socket without an inode attached2968 * to it, but the wake_up's will just wake up the listening socket,2969 * and if the listening socket is destroyed before this is taken2970 * off of the queue, this will take care of it.2971 */2972
2973 newsk = (structsock *) kmalloc(sizeof(structsock), GFP_ATOMIC);
2974 if (newsk == NULL)
2975 {2976 /* just ignore the syn. It will get retransmitted. */2977 tcp_statistics.TcpAttemptFails++;
2978 kfree_skb(skb, FREE_READ);
2979 return;
2980 }2981
2982 memcpy(newsk, sk, sizeof(*newsk));
2983 newsk->opt = NULL;
2984 if (opt && opt->optlen) {2985 sk->opt = (structoptions*)kmalloc(sizeof(structoptions)+opt->optlen, GFP_ATOMIC);
2986 if (!sk->opt) {2987 kfree_s(newsk, sizeof(structsock));
2988 tcp_statistics.TcpAttemptFails++;
2989 kfree_skb(skb, FREE_READ);
2990 return;
2991 }2992 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {2993 kfree_s(sk->opt, sizeof(structoptions)+opt->optlen);
2994 kfree_s(newsk, sizeof(structsock));
2995 tcp_statistics.TcpAttemptFails++;
2996 kfree_skb(skb, FREE_READ);
2997 return;
2998 }2999 }3000 skb_queue_head_init(&newsk->write_queue);
3001 skb_queue_head_init(&newsk->receive_queue);
3002 newsk->send_head = NULL;
3003 newsk->send_tail = NULL;
3004 skb_queue_head_init(&newsk->back_log);
3005 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/3006 newsk->rto = TCP_TIMEOUT_INIT;
3007 newsk->mdev = 0;
3008 newsk->max_window = 0;
3009 newsk->cong_window = 1;
3010 newsk->cong_count = 0;
3011 newsk->ssthresh = 0;
3012 newsk->backoff = 0;
3013 newsk->blog = 0;
3014 newsk->intr = 0;
3015 newsk->proc = 0;
3016 newsk->done = 0;
3017 newsk->partial = NULL;
3018 newsk->pair = NULL;
3019 newsk->wmem_alloc = 0;
3020 newsk->rmem_alloc = 0;
3021 newsk->localroute = sk->localroute;
3022
3023 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
3024
3025 newsk->err = 0;
3026 newsk->shutdown = 0;
3027 newsk->ack_backlog = 0;
3028 newsk->acked_seq = skb->h.th->seq+1;
3029 newsk->copied_seq = skb->h.th->seq+1;
3030 newsk->fin_seq = skb->h.th->seq;
3031 newsk->state = TCP_SYN_RECV;
3032 newsk->timeout = 0;
3033 newsk->ip_xmit_timeout = 0;
3034 newsk->write_seq = seq;
3035 newsk->window_seq = newsk->write_seq;
3036 newsk->rcv_ack_seq = newsk->write_seq;
3037 newsk->urg_data = 0;
3038 newsk->retransmits = 0;
3039 newsk->linger=0;
3040 newsk->destroy = 0;
3041 init_timer(&newsk->timer);
3042 newsk->timer.data = (unsignedlong)newsk;
3043 newsk->timer.function = &net_timer;
3044 init_timer(&newsk->retransmit_timer);
3045 newsk->retransmit_timer.data = (unsignedlong)newsk;
3046 newsk->retransmit_timer.function=&retransmit_timer;
3047 newsk->dummy_th.source = skb->h.th->dest;
3048 newsk->dummy_th.dest = skb->h.th->source;
3049
3050 /*3051 * Swap these two, they are from our point of view. 3052 */3053
3054 newsk->daddr = saddr;
3055 newsk->saddr = daddr;
3056 newsk->rcv_saddr = daddr;
3057
3058 put_sock(newsk->num,newsk);
3059 newsk->dummy_th.res1 = 0;
3060 newsk->dummy_th.doff = 6;
3061 newsk->dummy_th.fin = 0;
3062 newsk->dummy_th.syn = 0;
3063 newsk->dummy_th.rst = 0;
3064 newsk->dummy_th.psh = 0;
3065 newsk->dummy_th.ack = 0;
3066 newsk->dummy_th.urg = 0;
3067 newsk->dummy_th.res2 = 0;
3068 newsk->acked_seq = skb->h.th->seq + 1;
3069 newsk->copied_seq = skb->h.th->seq + 1;
3070 newsk->socket = NULL;
3071
3072 /*3073 * Grab the ttl and tos values and use them 3074 */3075
3076 newsk->ip_ttl=sk->ip_ttl;
3077 newsk->ip_tos=skb->ip_hdr->tos;
3078
3079 /*3080 * Use 512 or whatever user asked for 3081 */3082
3083 /*3084 * Note use of sk->user_mss, since user has no direct access to newsk 3085 */3086
3087 rt=ip_rt_route(saddr, NULL,NULL);
3088
3089 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3090 newsk->window_clamp = rt->rt_window;
3091 else3092 newsk->window_clamp = 0;
3093
3094 if (sk->user_mss)
3095 newsk->mtu = sk->user_mss;
3096 elseif(rt!=NULL && (rt->rt_flags&RTF_MSS))
3097 newsk->mtu = rt->rt_mss - sizeof(structiphdr) - sizeof(structtcphdr);
3098 else3099 {3100 #ifdefCONFIG_INET_SNARL/* Sub Nets Are Local */3101 if ((saddr ^ daddr) & default_mask(saddr))
3102 #else3103 if ((saddr ^ daddr) & dev->pa_mask)
3104 #endif3105 newsk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
3106 else3107 newsk->mtu = MAX_WINDOW;
3108 }3109
3110 /*3111 * But not bigger than device MTU 3112 */3113
3114 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
3115
3116 /*3117 * This will min with what arrived in the packet 3118 */3119
3120 tcp_options(newsk,skb->h.th);
3121
3122 tcp_cache_zap();
3123
3124 buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3125 if (buff == NULL)
3126 {3127 sk->err = ENOMEM;
3128 newsk->dead = 1;
3129 newsk->state = TCP_CLOSE;
3130 /* And this will destroy it */3131 release_sock(newsk);
3132 kfree_skb(skb, FREE_READ);
3133 tcp_statistics.TcpAttemptFails++;
3134 return;
3135 }3136
3137 buff->sk = newsk;
3138 buff->localroute = newsk->localroute;
3139
3140 /*3141 * Put in the IP header and routing stuff. 3142 */3143
3144 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3145 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
3146
3147 /*3148 * Something went wrong. 3149 */3150
3151 if (tmp < 0)
3152 {3153 sk->err = tmp;
3154 buff->free = 1;
3155 kfree_skb(buff,FREE_WRITE);
3156 newsk->dead = 1;
3157 newsk->state = TCP_CLOSE;
3158 release_sock(newsk);
3159 skb->sk = sk;
3160 kfree_skb(skb, FREE_READ);
3161 tcp_statistics.TcpAttemptFails++;
3162 return;
3163 }3164
3165 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
3166
3167 memcpy(t1, skb->h.th, sizeof(*t1));
3168 buff->h.seq = newsk->write_seq;
3169 /*3170 * Swap the send and the receive. 3171 */3172 t1->dest = skb->h.th->source;
3173 t1->source = newsk->dummy_th.source;
3174 t1->seq = ntohl(newsk->write_seq++);
3175 t1->ack = 1;
3176 newsk->window = tcp_select_window(newsk);
3177 newsk->sent_seq = newsk->write_seq;
3178 t1->window = ntohs(newsk->window);
3179 t1->res1 = 0;
3180 t1->res2 = 0;
3181 t1->rst = 0;
3182 t1->urg = 0;
3183 t1->psh = 0;
3184 t1->syn = 1;
3185 t1->ack_seq = ntohl(skb->h.th->seq+1);
3186 t1->doff = sizeof(*t1)/4+1;
3187 ptr = skb_put(buff,4);
3188 ptr[0] = 2;
3189 ptr[1] = 4;
3190 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3191 ptr[3] =(newsk->mtu) & 0xff;
3192
3193 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3194 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3195 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3196 skb->sk = newsk;
3197
3198 /*3199 * Charge the sock_buff to newsk. 3200 */3201
3202 sk->rmem_alloc -= skb->truesize;
3203 newsk->rmem_alloc += skb->truesize;
3204
3205 skb_queue_tail(&sk->receive_queue,skb);
3206 sk->ack_backlog++;
3207 release_sock(newsk);
3208 tcp_statistics.TcpOutSegs++;
3209 }3210
3211
/*
 *	Close a TCP socket.  timeout != 0 means hard close (go straight
 *	to TCP_CLOSE); timeout == 0 is the normal descriptor close which
 *	walks the state machine and may send a FIN.
 */
static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 *	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 */

	sk->inuse = 1;	/* lock the socket */

	/* Drop the stale header-prediction cache entry for this sock. */
	if(th_cache_sk==sk)
		tcp_cache_zap();
	if(sk->state == TCP_LISTEN)
	{
		/* Special case: just reject all pending embryonic connections. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;	/* no more sends or receives */

	if (!sk->dead)
		sk->state_change(sk);

	if (timeout == 0)
	{
		struct sk_buff *skb;

		/*
		 *	We need to flush the recv. buffs. We do this only on the
		 *	descriptor close, not protocol-sourced closes, because the
		 *	reader process may not have drained the data yet!
		 */

		while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 *	Get rid off any half-completed packets.
		 */

		if (sk->partial)
			tcp_send_partial(sk);
	}


	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */

	if(timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* Normal close: run the state machine, FIN if required. */
		if(tcp_close_state(sk,1)==1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
3278
3279 /*3280 * This routine takes stuff off of the write queue,3281 * and puts it in the xmit queue. This happens as incoming acks3282 * open up the remote window for us.3283 */3284
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue.  This happens as incoming acks
 *	open up the remote window for us.
 */
static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->h.seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it. This
			 *	cannot currently occur.
			 */

			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;

			/*
			 *	put in the ack seq and window at this point rather than earlier,
			 *	in order to keep them monotonic. We really want to avoid taking
			 *	back window allocations. That's legal, but RFC1122 says it's frowned on.
			 *	Ack and window will in general have changed since this packet was put
			 *	on the write queue.
			 */
			iph = skb->ip_hdr;
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			size = skb->len - (((unsigned char *) th) - skb->data);

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			/* Header fields changed, so the checksum must be redone. */
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */

			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */

			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3369
3370 /*3371 * This routine deals with incoming acks, but not outgoing ones.3372 */3373
3374 extern__inline__inttcp_ack(structsock *sk, structtcphdr *th, unsignedlongsaddr, intlen)
/* */3375 {3376 u32ack;
3377 intflag = 0;
3378
3379 /* 3380 * 1 - there was data in packet as well as ack or new data is sent or 3381 * in shutdown state3382 * 2 - data from retransmit queue was acked and removed3383 * 4 - window shrunk or data from retransmit queue was acked and removed3384 */3385
3386 if(sk->zapped)
3387 return(1); /* Dead, cant ack any more so why bother */3388
3389 /*3390 * Have we discovered a larger window3391 */3392
3393 ack = ntohl(th->ack_seq);
3394
3395 if (ntohs(th->window) > sk->max_window)
3396 {3397 sk->max_window = ntohs(th->window);
3398 #ifdefCONFIG_INET_PCTCP3399 /* Hack because we don't send partial packets to non SWS3400 handling hosts */3401 sk->mss = min(sk->max_window>>1, sk->mtu);
3402 #else3403 sk->mss = min(sk->max_window, sk->mtu);
3404 #endif3405 }3406
3407 /*3408 * We have dropped back to keepalive timeouts. Thus we have3409 * no retransmits pending.3410 */3411
3412 if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3413 sk->retransmits = 0;
3414
3415 /*3416 * If the ack is newer than sent or older than previous acks3417 * then we can probably ignore it.3418 */3419
3420 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
3421 {3422 if(sk->debug)
3423 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3424
3425 /*3426 * Keepalive processing.3427 */3428
3429 if (after(ack, sk->sent_seq))
3430 {3431 return(0);
3432 }3433
3434 /*3435 * Restart the keepalive timer.3436 */3437
3438 if (sk->keepopen)
3439 {3440 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3441 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3442 }3443 return(1);
3444 }3445
3446 /*3447 * If there is data set flag 13448 */3449
3450 if (len != th->doff*4)
3451 flag |= 1;
3452
3453 /*3454 * See if our window has been shrunk. 3455 */3456
3457 if (after(sk->window_seq, ack+ntohs(th->window)))
3458 {3459 /*3460 * We may need to move packets from the send queue3461 * to the write queue, if the window has been shrunk on us.3462 * The RFC says you are not allowed to shrink your window3463 * like this, but if the other end does, you must be able3464 * to deal with it.3465 */3466 structsk_buff *skb;
3467 structsk_buff *skb2;
3468 structsk_buff *wskb = NULL;
3469
3470 skb2 = sk->send_head;
3471 sk->send_head = NULL;
3472 sk->send_tail = NULL;
3473
3474 /*3475 * This is an artifact of a flawed concept. We want one3476 * queue and a smarter send routine when we send all.3477 */3478
3479 flag |= 4; /* Window changed */3480
3481 sk->window_seq = ack + ntohs(th->window);
3482 cli();
3483 while (skb2 != NULL)
3484 {3485 skb = skb2;
3486 skb2 = skb->link3;
3487 skb->link3 = NULL;
3488 if (after(skb->h.seq, sk->window_seq))
3489 {3490 if (sk->packets_out > 0)
3491 sk->packets_out--;
3492 /* We may need to remove this from the dev send list. */3493 if (skb->next != NULL)
3494 {3495 skb_unlink(skb);
3496 }3497 /* Now add it to the write_queue. */3498 if (wskb == NULL)
3499 skb_queue_head(&sk->write_queue,skb);
3500 else3501 skb_append(wskb,skb);
3502 wskb = skb;
3503 }3504 else3505 {3506 if (sk->send_head == NULL)
3507 {3508 sk->send_head = skb;
3509 sk->send_tail = skb;
3510 }3511 else3512 {3513 sk->send_tail->link3 = skb;
3514 sk->send_tail = skb;
3515 }3516 skb->link3 = NULL;
3517 }3518 }3519 sti();
3520 }3521
3522 /*3523 * Pipe has emptied3524 */3525
3526 if (sk->send_tail == NULL || sk->send_head == NULL)
3527 {3528 sk->send_head = NULL;
3529 sk->send_tail = NULL;
3530 sk->packets_out= 0;
3531 }3532
3533 /*3534 * Update the right hand window edge of the host3535 */3536
3537 sk->window_seq = ack + ntohs(th->window);
3538
3539 /*3540 * We don't want too many packets out there. 3541 */3542
3543 if (sk->ip_xmit_timeout == TIME_WRITE &&
3544 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
3545 {3546 /* 3547 * This is Jacobson's slow start and congestion avoidance. 3548 * SIGCOMM '88, p. 328. Because we keep cong_window in integral3549 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a 3550 * counter and increment it once every cwnd times. It's possible3551 * that this should be done only if sk->retransmits == 0. I'm3552 * interpreting "new data is acked" as including data that has3553 * been retransmitted but is just now being acked.3554 */3555 if (sk->cong_window < sk->ssthresh)
3556 /* 3557 * In "safe" area, increase3558 */3559 sk->cong_window++;
3560 else3561 {3562 /*3563 * In dangerous area, increase slowly. In theory this is3564 * sk->cong_window += 1 / sk->cong_window3565 */3566 if (sk->cong_count >= sk->cong_window)
3567 {3568 sk->cong_window++;
3569 sk->cong_count = 0;
3570 }3571 else3572 sk->cong_count++;
3573 }3574 }3575
3576 /*3577 * Remember the highest ack received.3578 */3579
3580 sk->rcv_ack_seq = ack;
3581
3582 /*3583 * If this ack opens up a zero window, clear backoff. It was3584 * being used to time the probes, and is probably far higher than3585 * it needs to be for normal retransmission.3586 */3587
3588 if (sk->ip_xmit_timeout == TIME_PROBE0)
3589 {3590 sk->retransmits = 0; /* Our probe was answered */3591
3592 /*3593 * Was it a usable window open ?3594 */3595
3596 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */3597 ! before (sk->window_seq, sk->write_queue.next->h.seq))
3598 {3599 sk->backoff = 0;
3600
3601 /*3602 * Recompute rto from rtt. this eliminates any backoff.3603 */3604
3605 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3606 if (sk->rto > 120*HZ)
3607 sk->rto = 120*HZ;
3608 if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about3609 .2 of a second because of BSD delayed acks - on a 100Mb/sec link3610 .2 of a second is going to need huge windows (SIGH) */3611 sk->rto = 20;
3612 }3613 }3614
3615 /* 3616 * See if we can take anything off of the retransmit queue.3617 */3618
3619 while(sk->send_head != NULL)
3620 {3621 /* Check for a bug. */3622 if (sk->send_head->link3 &&
3623 after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
3624 printk("INET: tcp.c: *** bug send_list out of order.\n");
3625
3626 /*3627 * If our packet is before the ack sequence we can3628 * discard it as it's confirmed to have arrived the other end.3629 */3630
3631 if (before(sk->send_head->h.seq, ack+1))
3632 {3633 structsk_buff *oskb;
3634 if (sk->retransmits)
3635 {3636 /*3637 * We were retransmitting. don't count this in RTT est 3638 */3639 flag |= 2;
3640
3641 /*3642 * even though we've gotten an ack, we're still3643 * retransmitting as long as we're sending from3644 * the retransmit queue. Keeping retransmits non-zero3645 * prevents us from getting new data interspersed with3646 * retransmissions.3647 */3648
3649 if (sk->send_head->link3) /* Any more queued retransmits? */3650 sk->retransmits = 1;
3651 else3652 sk->retransmits = 0;
3653 }3654 /*3655 * Note that we only reset backoff and rto in the3656 * rtt recomputation code. And that doesn't happen3657 * if there were retransmissions in effect. So the3658 * first new packet after the retransmissions is3659 * sent with the backoff still in effect. Not until3660 * we get an ack from a non-retransmitted packet do3661 * we reset the backoff and rto. This allows us to deal3662 * with a situation where the network delay has increased3663 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)3664 */3665
3666 /*3667 * We have one less packet out there. 3668 */3669
3670 if (sk->packets_out > 0)
3671 sk->packets_out --;
3672 /* 3673 * Wake up the process, it can probably write more. 3674 */3675 if (!sk->dead)
3676 sk->write_space(sk);
3677 oskb = sk->send_head;
3678
3679 if (!(flag&2)) /* Not retransmitting */3680 {3681 longm;
3682
3683 /*3684 * The following amusing code comes from Jacobson's3685 * article in SIGCOMM '88. Note that rtt and mdev3686 * are scaled versions of rtt and mean deviation.3687 * This is designed to be as fast as possible 3688 * m stands for "measurement".3689 */3690
3691 m = jiffies - oskb->when; /* RTT */3692 if(m<=0)
3693 m=1; /* IS THIS RIGHT FOR <0 ??? */3694 m -= (sk->rtt >> 3); /* m is now error in rtt est */3695 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */3696 if (m < 0)
3697 m = -m; /* m is now abs(error) */3698 m -= (sk->mdev >> 2); /* similar update on mdev */3699 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */3700
3701 /*3702 * Now update timeout. Note that this removes any backoff.3703 */3704
3705 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3706 if (sk->rto > 120*HZ)
3707 sk->rto = 120*HZ;
3708 if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */3709 sk->rto = 20;
3710 sk->backoff = 0;
3711 }3712 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt 3713 In this case as we just set it up */3714 cli();
3715 oskb = sk->send_head;
3716 IS_SKB(oskb);
3717 sk->send_head = oskb->link3;
3718 if (sk->send_head == NULL)
3719 {3720 sk->send_tail = NULL;
3721 }3722
3723 /*3724 * We may need to remove this from the dev send list. 3725 */3726
3727 if (oskb->next)
3728 skb_unlink(oskb);
3729 sti();
3730 kfree_skb(oskb, FREE_WRITE); /* write. */3731 if (!sk->dead)
3732 sk->write_space(sk);
3733 }3734 else3735 {3736 break;
3737 }3738 }3739
3740 /*3741 * XXX someone ought to look at this too.. at the moment, if skb_peek()3742 * returns non-NULL, we complete ignore the timer stuff in the else3743 * clause. We ought to organize the code so that else clause can3744 * (should) be executed regardless, possibly moving the PROBE timer3745 * reset over. The skb_peek() thing should only move stuff to the3746 * write queue, NOT also manage the timer functions.3747 */3748
3749 /*3750 * Maybe we can take some stuff off of the write queue,3751 * and put it onto the xmit queue.3752 */3753 if (skb_peek(&sk->write_queue) != NULL)
3754 {3755 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3756 (sk->retransmits == 0 ||
3757 sk->ip_xmit_timeout != TIME_WRITE ||
3758 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3759 && sk->packets_out < sk->cong_window)
3760 {3761 /*3762 * Add more data to the send queue.3763 */3764 flag |= 1;
3765 tcp_write_xmit(sk);
3766 }3767 elseif (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3768 sk->send_head == NULL &&
3769 sk->ack_backlog == 0 &&
3770 sk->state != TCP_TIME_WAIT)
3771 {3772 /*3773 * Data to queue but no room.3774 */3775 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3776 }3777 }3778 else3779 {3780 /*3781 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets3782 * from TCP_CLOSE we don't do anything3783 *3784 * from anything else, if there is write data (or fin) pending,3785 * we use a TIME_WRITE timeout, else if keepalive we reset to3786 * a KEEPALIVE timeout, else we delete the timer.3787 *3788 * We do not set flag for nominal write data, otherwise we may3789 * force a state where we start to write itsy bitsy tidbits3790 * of data.3791 */3792
3793 switch(sk->state) {3794 caseTCP_TIME_WAIT:
3795 /*3796 * keep us in TIME_WAIT until we stop getting packets,3797 * reset the timeout.3798 */3799 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3800 break;
3801 caseTCP_CLOSE:
3802 /*3803 * don't touch the timer.3804 */3805 break;
3806 default:
3807 /*3808 * Must check send_head, write_queue, and ack_backlog3809 * to determine which timeout to use.3810 */3811 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {3812 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3813 }elseif (sk->keepopen) {3814 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3815 }else{3816 del_timer(&sk->retransmit_timer);
3817 sk->ip_xmit_timeout = 0;
3818 }3819 break;
3820 }3821 }3822
3823 /*3824 * We have nothing queued but space to send. Send any partial3825 * packets immediately (end of Nagle rule application).3826 */3827
3828 if (sk->packets_out == 0 && sk->partial != NULL &&
3829 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
3830 {3831 flag |= 1;
3832 tcp_send_partial(sk);
3833 }3834
3835 /*3836 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and3837 * we are now waiting for an acknowledge to our FIN. The other end is3838 * already in TIME_WAIT.3839 *3840 * Move to TCP_CLOSE on success.3841 */3842
3843 if (sk->state == TCP_LAST_ACK)
3844 {3845 if (!sk->dead)
3846 sk->state_change(sk);
3847 if(sk->debug)
3848 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3849 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3850 if (sk->rcv_ack_seq == sk->write_seq/*&& sk->acked_seq == sk->fin_seq*/)
3851 {3852 flag |= 1;
3853 tcp_set_state(sk,TCP_CLOSE);
3854 sk->shutdown = SHUTDOWN_MASK;
3855 }3856 }3857
3858 /*3859 * Incoming ACK to a FIN we sent in the case of our initiating the close.3860 *3861 * Move to FIN_WAIT2 to await a FIN from the other end. Set3862 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.3863 */3864
3865 if (sk->state == TCP_FIN_WAIT1)
3866 {3867
3868 if (!sk->dead)
3869 sk->state_change(sk);
3870 if (sk->rcv_ack_seq == sk->write_seq)
3871 {3872 flag |= 1;
3873 sk->shutdown |= SEND_SHUTDOWN;
3874 tcp_set_state(sk, TCP_FIN_WAIT2);
3875 }3876 }3877
3878 /*3879 * Incoming ACK to a FIN we sent in the case of a simultaneous close.3880 *3881 * Move to TIME_WAIT3882 */3883
3884 if (sk->state == TCP_CLOSING)
3885 {3886
3887 if (!sk->dead)
3888 sk->state_change(sk);
3889 if (sk->rcv_ack_seq == sk->write_seq)
3890 {3891 flag |= 1;
3892 tcp_time_wait(sk);
3893 }3894 }3895
3896 /*3897 * Final ack of a three way shake 3898 */3899
3900 if(sk->state==TCP_SYN_RECV)
3901 {3902 tcp_set_state(sk, TCP_ESTABLISHED);
3903 tcp_options(sk,th);
3904 sk->dummy_th.dest=th->source;
3905 sk->copied_seq = sk->acked_seq;
3906 if(!sk->dead)
3907 sk->state_change(sk);
3908 if(sk->max_window==0)
3909 {3910 sk->max_window=32; /* Sanity check */3911 sk->mss=min(sk->max_window,sk->mtu);
3912 }3913 }3914
3915 /*3916 * I make no guarantees about the first clause in the following3917 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under3918 * what conditions "!flag" would be true. However I think the rest3919 * of the conditions would prevent that from causing any3920 * unnecessary retransmission. 3921 * Clearly if the first packet has expired it should be 3922 * retransmitted. The other alternative, "flag&2 && retransmits", is3923 * harder to explain: You have to look carefully at how and when the3924 * timer is set and with what timeout. The most recent transmission always3925 * sets the timer. So in general if the most recent thing has timed3926 * out, everything before it has as well. So we want to go ahead and3927 * retransmit some more. If we didn't explicitly test for this3928 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"3929 * would not be true. If you look at the pattern of timing, you can3930 * show that rto is increased fast enough that the next packet would3931 * almost never be retransmitted immediately. Then you'd end up3932 * waiting for a timeout to send each packet on the retransmission3933 * queue. With my implementation of the Karn sampling algorithm,3934 * the timeout would double each time. The net result is that it would3935 * take a hideous amount of time to recover from a single dropped packet.3936 * It's possible that there should also be a test for TIME_WRITE, but3937 * I think as long as "send_head != NULL" and "retransmit" is on, we've3938 * got to be in real retransmission mode.3939 * Note that tcp_do_retransmit is called with all==1. Setting cong_window3940 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.3941 * As long as no further losses occur, this seems reasonable.3942 */3943
3944 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3945 (((flag&2) && sk->retransmits) ||
3946 (sk->send_head->when + sk->rto < jiffies)))
3947 {3948 if(sk->send_head->when + sk->rto < jiffies)
3949 tcp_retransmit(sk,0);
3950 else3951 {3952 tcp_do_retransmit(sk, 1);
3953 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3954 }3955 }3956
3957 return(1);
3958 }3959
3960
/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 *
 *	Returns 0 in all cases; the two early returns exist only to skip
 *	the trailing fallthrough return.
 */

static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	/*
	 *	Sequence number of the octet following the FIN: data length
	 *	plus one each for the SYN and FIN flags if set.
	 */
	sk->fin_seq = th->seq + skb->len + th->syn + th->fin;

	/*
	 *	Wake anyone waiting on a state change, and post the async
	 *	(SIGIO-style) notification.
	 */
	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * move to CLOSE_WAIT, tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			/* A FIN combined with RST shuts both directions down. */
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * received a retransmission of the FIN,
			 * restart the TIME_WAIT timer.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens, we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */
			if(sk->ip_xmit_timeout != TIME_WRITE)
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * received a FIN -- send ACK and enter TIME_WAIT
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * already in CLOSE
			 */
			break;
		default:
			/* Remaining states (e.g. LAST_ACK path origin): our
			   side has already sent a FIN; go to LAST_ACK and
			   bound the wait with the MSL timer. */
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}
4056
4057
4058 /*4059 * This routine handles the data. If there is room in the buffer,4060 * it will be have already been moved into it. If there is no4061 * room, then we will just have to discard the packet.4062 */4063
4064 extern__inline__inttcp_data(structsk_buff *skb, structsock *sk,
/* */4065 unsignedlongsaddr, unsignedshortlen)
4066 {4067 structsk_buff *skb1, *skb2;
4068 structtcphdr *th;
4069 intdup_dumped=0;
4070 u32new_seq, shut_seq;
4071
4072 th = skb->h.th;
4073 skb_pull(skb,th->doff*4);
4074 skb_trim(skb,len-(th->doff*4));
4075
4076 /*4077 * The bytes in the receive read/assembly queue has increased. Needed for the4078 * low memory discard algorithm 4079 */4080
4081 sk->bytes_rcv += skb->len;
4082
4083 if (skb->len == 0 && !th->fin)
4084 {4085 /* 4086 * Don't want to keep passing ack's back and forth. 4087 * (someone sent us dataless, boring frame)4088 */4089 if (!th->ack)
4090 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4091 kfree_skb(skb, FREE_READ);
4092 return(0);
4093 }4094
4095 /*4096 * We no longer have anyone receiving data on this connection.4097 */4098
4099 #ifndef TCP_DONT_RST_SHUTDOWN
4100
4101 if(sk->shutdown & RCV_SHUTDOWN)
4102 {4103 /*4104 * FIXME: BSD has some magic to avoid sending resets to4105 * broken 4.2 BSD keepalives. Much to my surprise a few non4106 * BSD stacks still have broken keepalives so we want to4107 * cope with it.4108 */4109
4110 if(skb->len) /* We don't care if it's just an ack or4111 a keepalive/window probe */4112 {4113 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */4114
4115 /* Do this the way 4.4BSD treats it. Not what I'd4116 regard as the meaning of the spec but it's what BSD4117 does and clearly they know everything 8) */4118
4119 /*4120 * This is valid because of two things4121 *4122 * a) The way tcp_data behaves at the bottom.4123 * b) A fin takes effect when read not when received.4124 */4125
4126 shut_seq=sk->acked_seq+1; /* Last byte */4127
4128 if(after(new_seq,shut_seq))
4129 {4130 if(sk->debug)
4131 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4132 sk, new_seq, shut_seq, sk->blog);
4133 if(sk->dead)
4134 {4135 sk->acked_seq = new_seq + th->fin;
4136 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4137 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4138 tcp_statistics.TcpEstabResets++;
4139 tcp_set_state(sk,TCP_CLOSE);
4140 sk->err = EPIPE;
4141 sk->shutdown = SHUTDOWN_MASK;
4142 kfree_skb(skb, FREE_READ);
4143 return 0;
4144 }4145 }4146 }4147 }4148
4149 #endif4150
4151 /*4152 * Now we have to walk the chain, and figure out where this one4153 * goes into it. This is set up so that the last packet we received4154 * will be the first one we look at, that way if everything comes4155 * in order, there will be no performance loss, and if they come4156 * out of order we will be able to fit things in nicely.4157 *4158 * [AC: This is wrong. We should assume in order first and then walk4159 * forwards from the first hole based upon real traffic patterns.]4160 * 4161 */4162
4163 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */4164 {4165 skb_queue_head(&sk->receive_queue,skb);
4166 skb1= NULL;
4167 }4168 else4169 {4170 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
4171 {4172 if(sk->debug)
4173 {4174 printk("skb1=%p :", skb1);
4175 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
4176 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
4177 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4178 sk->acked_seq);
4179 }4180
4181 /*4182 * Optimisation: Duplicate frame or extension of previous frame from4183 * same sequence point (lost ack case).4184 * The frame contains duplicate data or replaces a previous frame4185 * discard the previous frame (safe as sk->inuse is set) and put4186 * the new one in its place.4187 */4188
4189 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
4190 {4191 skb_append(skb1,skb);
4192 skb_unlink(skb1);
4193 kfree_skb(skb1,FREE_READ);
4194 dup_dumped=1;
4195 skb1=NULL;
4196 break;
4197 }4198
4199 /*4200 * Found where it fits4201 */4202
4203 if (after(th->seq+1, skb1->h.th->seq))
4204 {4205 skb_append(skb1,skb);
4206 break;
4207 }4208
4209 /*4210 * See if we've hit the start. If so insert.4211 */4212 if (skb1 == skb_peek(&sk->receive_queue))
4213 {4214 skb_queue_head(&sk->receive_queue, skb);
4215 break;
4216 }4217 }4218 }4219
4220 /*4221 * Figure out what the ack value for this frame is4222 */4223
4224 th->ack_seq = th->seq + skb->len;
4225 if (th->syn)
4226 th->ack_seq++;
4227 if (th->fin)
4228 th->ack_seq++;
4229
4230 if (before(sk->acked_seq, sk->copied_seq))
4231 {4232 printk("*** tcp.c:tcp_data bug acked < copied\n");
4233 sk->acked_seq = sk->copied_seq;
4234 }4235
4236 /*4237 * Now figure out if we can ack anything. This is very messy because we really want two4238 * receive queues, a completed and an assembly queue. We also want only one transmit4239 * queue.4240 */4241
4242 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
4243 {4244 if (before(th->seq, sk->acked_seq+1))
4245 {4246 intnewwindow;
4247
4248 if (after(th->ack_seq, sk->acked_seq))
4249 {4250 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
4251 if (newwindow < 0)
4252 newwindow = 0;
4253 sk->window = newwindow;
4254 sk->acked_seq = th->ack_seq;
4255 }4256 skb->acked = 1;
4257
4258 /*4259 * When we ack the fin, we do the FIN 4260 * processing.4261 */4262
4263 if (skb->h.th->fin)
4264 {4265 tcp_fin(skb,sk,skb->h.th);
4266 }4267
4268 for(skb2 = skb->next;
4269 skb2 != (structsk_buff *)&sk->receive_queue;
4270 skb2 = skb2->next)
4271 {4272 if (before(skb2->h.th->seq, sk->acked_seq+1))
4273 {4274 if (after(skb2->h.th->ack_seq, sk->acked_seq))
4275 {4276 newwindow = sk->window -
4277 (skb2->h.th->ack_seq - sk->acked_seq);
4278 if (newwindow < 0)
4279 newwindow = 0;
4280 sk->window = newwindow;
4281 sk->acked_seq = skb2->h.th->ack_seq;
4282 }4283 skb2->acked = 1;
4284 /*4285 * When we ack the fin, we do4286 * the fin handling.4287 */4288 if (skb2->h.th->fin)
4289 {4290 tcp_fin(skb,sk,skb->h.th);
4291 }4292
4293 /*4294 * Force an immediate ack.4295 */4296
4297 sk->ack_backlog = sk->max_ack_backlog;
4298 }4299 else4300 {4301 break;
4302 }4303 }4304
4305 /*4306 * This also takes care of updating the window.4307 * This if statement needs to be simplified.4308 */4309 if (!sk->delay_acks ||
4310 sk->ack_backlog >= sk->max_ack_backlog ||
4311 sk->bytes_rcv > sk->max_unacked || th->fin) {4312 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4313 }4314 else4315 {4316 sk->ack_backlog++;
4317 if(sk->debug)
4318 printk("Ack queued.\n");
4319 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4320 }4321 }4322 }4323
4324 /*4325 * If we've missed a packet, send an ack.4326 * Also start a timer to send another.4327 */4328
4329 if (!skb->acked)
4330 {4331
4332 /*4333 * This is important. If we don't have much room left,4334 * we need to throw out a few packets so we have a good4335 * window. Note that mtu is used, not mss, because mss is really4336 * for the send side. He could be sending us stuff as large as mtu.4337 */4338
4339 while (sock_rspace(sk) < sk->mtu)
4340 {4341 skb1 = skb_peek(&sk->receive_queue);
4342 if (skb1 == NULL)
4343 {4344 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4345 break;
4346 }4347
4348 /*4349 * Don't throw out something that has been acked. 4350 */4351
4352 if (skb1->acked)
4353 {4354 break;
4355 }4356
4357 skb_unlink(skb1);
4358 kfree_skb(skb1, FREE_READ);
4359 }4360 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4361 sk->ack_backlog++;
4362 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4363 }4364 else4365 {4366 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4367 }4368
4369 /*4370 * Now tell the user we may have some data. 4371 */4372
4373 if (!sk->dead)
4374 {4375 if(sk->debug)
4376 printk("Data wakeup.\n");
4377 sk->data_ready(sk,0);
4378 }4379 return(0);
4380 }4381
4382
4383 /*4384 * This routine is only called when we have urgent data4385 * signalled. Its the 'slow' part of tcp_urg. It could be4386 * moved inline now as tcp_urg is only called from one4387 * place. We handle URGent data wrong. We have to - as4388 * BSD still doesn't use the correction from RFC961.4389 */4390
4391 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4392 {4393 u32ptr = ntohs(th->urg_ptr);
4394
4395 if (ptr)
4396 ptr--;
4397 ptr += th->seq;
4398
4399 /* ignore urgent data that we've already seen and read */4400 if (after(sk->copied_seq, ptr))
4401 return;
4402
4403 /* do we already have a newer (or duplicate) urgent pointer? */4404 if (sk->urg_data && !after(ptr, sk->urg_seq))
4405 return;
4406
4407 /* tell the world about our new urgent pointer */4408 if (sk->proc != 0) {4409 if (sk->proc > 0) {4410 kill_proc(sk->proc, SIGURG, 1);
4411 }else{4412 kill_pg(-sk->proc, SIGURG, 1);
4413 }4414 }4415 sk->urg_data = URG_NOTYET;
4416 sk->urg_seq = ptr;
4417 }4418
4419 /*4420 * This is the 'fast' part of urgent handling.4421 */4422
4423 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4424 unsignedlongsaddr, unsignedlonglen)
4425 {4426 u32ptr;
4427
4428 /*4429 * Check if we get a new urgent pointer - normally not 4430 */4431
4432 if (th->urg)
4433 tcp_check_urg(sk,th);
4434
4435 /*4436 * Do we wait for any urgent data? - normally not4437 */4438
4439 if (sk->urg_data != URG_NOTYET)
4440 return 0;
4441
4442 /*4443 * Is the urgent pointer pointing into this packet? 4444 */4445
4446 ptr = sk->urg_seq - th->seq + th->doff*4;
4447 if (ptr >= len)
4448 return 0;
4449
4450 /*4451 * Ok, got the correct packet, update info 4452 */4453
4454 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4455 if (!sk->dead)
4456 sk->data_ready(sk,0);
4457 return 0;
4458 }4459
/*
 *	This will accept the next outstanding connection.
 *
 *	Returns the new (established) socket, or NULL with sk->err set to
 *	EINVAL (not listening), EAGAIN (non-blocking, nothing pending) or
 *	ERESTARTSYS (interrupted by a signal).
 */

static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 *	We need to make sure that this socket is listening,
	 *	and that it has something pending.
	 */
	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race. Interrupts stay off while we take the socket
	   and probe the accept queue. */
	cli();
	sk->inuse = 1;

	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Drop the socket lock while we sleep waiting for a
		   connection to complete the handshake. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked)
		{
			/* Interrupted; the socket was already released
			   before sleeping. */
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		/* Re-take the socket and retry the queue. */
		sk->inuse = 1;
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk.
	 */
	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4518
/*
 *	This will initiate an outgoing connection.
 *
 *	Validates the address, picks a route, builds and transmits the SYN
 *	(with an MSS option), moves the socket to SYN_SENT and arms the
 *	retransmit timer.  Returns 0 or a negative errno.
 */

static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
	{
		return(-EISCONN);
	}

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */
	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 *	Don't want a TCP connection going to a broadcast address
	 */
	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	/* Pick the initial send sequence; rcv_ack_seq is one behind so the
	   SYN itself counts as unacknowledged. */
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	release_sock(sk);

	/* May sleep (GFP_KERNEL), hence the release/re-take around it. */
	buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 *	The route lookup also fills in sk->saddr if it was unset.
	 */
	if (sk->localroute)
		rt=ip_rt_local(sk->daddr, NULL, sk->saddr ? NULL : &sk->saddr);
	else
		rt=ip_rt_route(sk->daddr, NULL, sk->saddr ? NULL : &sk->saddr);

	/*
	 *	When we connect we enforce receive requirements too.
	 */
	sk->rcv_saddr=sk->saddr;

	/*
	 *	We need to build the routing stuff from the things saved in skb.
	 *	On success this sets dev to the output device.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sock_wfree(sk, buff);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));

	/* Start from the template header, then set the SYN-specific bits. */
	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	t1->seq = ntohl(sk->write_seq++);
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;	/* 24 bytes: header plus the 4-byte MSS option */
	/* use 512 or whatever user asked for */

	/* Route may clamp our advertised window. */
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/* MSS preference order: user setting, route metric, then a
	   same-subnet heuristic. */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
		sk->mtu = rt->rt_mss;
	else
	{
#ifdef CONFIG_INET_SNARL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
		else
			sk->mtu = MAX_WINDOW;
	}

	/*
	 *	but not bigger than device MTU
	 */
	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	/* NOTE(review): dev is dereferenced here; build_header is assumed
	   to have set it on the success path above — confirm. */
	sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

	/*
	 *	Put in the TCP options to say MTU.
	 */
	ptr = skb_put(buff,4);
	ptr[0] = 2;		/* kind: MSS */
	ptr[1] = 4;		/* length */
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 *	This must go first otherwise a really quick response will get reset.
	 */
	tcp_cache_zap();
	tcp_set_state(sk,TCP_SYN_SENT);
	/* Initial RTO from the route's stored round trip, if any. */
	if(rt&&rt->rt_flags&RTF_IRTT)
		sk->rto = rt->rt_irtt;
	else
		sk->rto = TCP_TIMEOUT_INIT;
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = 0;	/* Now works the right way instead of a hacked
					initial setting */

	sk->prot->queue_xmit(sk, dev, buff, 0);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4689
4690 /* This functions checks to see if the tcp header is actually acceptable. */4691 extern__inline__inttcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4692 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4693 {4694 u32next_seq;
4695
4696 next_seq = len - 4*th->doff;
4697 if (th->fin)
4698 next_seq++;
4699 /* if we have a zero window, we can't have any data in the packet.. */4700 if (next_seq && !sk->window)
4701 gotoignore_it;
4702 next_seq += th->seq;
4703
4704 /*4705 * This isn't quite right. sk->acked_seq could be more recent4706 * than sk->window. This is however close enough. We will accept4707 * slightly more packets than we should, but it should not cause4708 * problems unless someone is trying to forge packets.4709 */4710
4711 /* have we already seen all of this packet? */4712 if (!after(next_seq+1, sk->acked_seq))
4713 gotoignore_it;
4714 /* or does it start beyond the window? */4715 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4716 gotoignore_it;
4717
4718 /* ok, at least part of this packet would seem interesting.. */4719 return 1;
4720
4721 ignore_it:
4722 if (th->rst)
4723 return 0;
4724
4725 /*4726 * Send a reset if we get something not ours and we are4727 * unsynchronized. Note: We don't do anything to our end. We4728 * are just killing the bogus remote connection then we will4729 * connect again and it will work (with luck).4730 */4731
4732 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4733 {4734 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4735 return 1;
4736 }4737
4738 /* Try to resync things. */4739 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4740 return 0;
4741 }4742
/*
 *	When we get a reset we do this: record the right error for the
 *	state we were in, kill the connection, wake the user and drop
 *	the frame.  Always returns 0.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* The error the user sees depends on where we were. */
	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
4775 /*4776 * A TCP packet has arrived.4777 * skb->h.raw is the TCP header.4778 */4779
/*
 *	A TCP packet has arrived.
 *	skb->h.raw is the TCP header.
 *
 *	Entry point from the IP layer (installed in tcp_prot).  Validates
 *	the checksum, finds the owning socket (with a one-entry lookup
 *	cache), queues to the backlog if the socket is busy, and then
 *	drives the RFC793 segment-arrival state machine.
 *	Returns 0 in all cases.
 */
int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol *protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok = 0;

	tcp_statistics.TcpInSegs++;

	/* Frames not addressed to this host (e.g. promiscuous pickups) are dropped. */
	if (skb->pkt_type != PACKET_HOST)
	{
		kfree_skb(skb, FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket, using the last hit cache if applicable.
	 */
	if (saddr == th_cache_saddr && daddr == th_cache_daddr && th->dest == th_cache_dport && th->source == th_cache_sport)
	{
		sk = (struct sock *)th_cache_sk;
		/*
		 *	We think this is causing the bug so
		 *	(sanity check: the cached socket must match a full lookup)
		 */
		if (sk != get_sock(&tcp_prot, th->dest, saddr, th->source, daddr))
			printk("Cache mismatch on TCP.\n");
	}
	else
	{
		sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
		th_cache_saddr = saddr;
		th_cache_daddr = daddr;
		th_cache_dport = th->dest;
		th_cache_sport = th->source;
		th_cache_sk = sk;
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */
	if (sk != NULL && (sk->zapped || sk->state == TCP_CLOSE))
		sk = NULL;

	if (!redo)		/* First pass (not a backlog replay) */
	{
		/*
		 *	Pull up the IP header.
		 */
		skb_pull(skb, skb->h.raw - skb->data);

		/*
		 *	Try to use the device checksum if provided.
		 */
		if (
			(skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum)) ||
			(!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
		)
		{
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}

		th->seq = ntohl(th->seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
			skb->sk = NULL;
			/*
			 *	Discard frame
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		/*	skb->len = len;	*/
		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here. */
		cli();
		if (sk->inuse)
		{
			/* Socket busy: defer processing; replayed later with redo=1. */
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Backlog replay: socket may have been destroyed in the meantime. */
		if (sk == NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}

	/* NOTE(review): "can't happen" guard — leaks the skb if it ever fires. */
	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}

	/*
	 *	Charge the memory to the socket.
	 */
	if (sk->rmem_alloc + skb->truesize >= sk->rcvbuf)
	{
		/* Receive buffer full — drop the segment. */
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

	skb->sk = sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */
	if (sk->state != TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{
		/*
		 *	Now deal with unusual cases.
		 */
		if (sk->state == TCP_LISTEN)
		{
			if (th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it
			 */
			if (th->rst || !th->syn || th->ack || ip_chk_addr(daddr) != IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */
			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 */
			release_sock(sk);
			return 0;
		}

		/* retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq + 1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected
		 */
		if (sk->state == TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if (th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if (!tcp_ack(sk, th, saddr, len))
				{
					/*
					 *	Reset the ack - its an ack from a
					 *	different connection  [th->rst is checked in tcp_reset()]
					 */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if (th->rst)
					return tcp_std_reset(sk, skb);
				if (!th->syn)
				{
					/*
					 *	A valid ack from a different connection
					 *	start. Shouldn't happen but cover it
					 */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok = 1;	/* Don't reset this connection for the syn */
				sk->acked_seq = th->seq + 1;
				sk->fin_seq = th->seq;
				tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk, th);
				sk->dummy_th.dest = th->source;
				sk->copied_seq = sk->acked_seq;
				if (!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if (sk->max_window == 0)
				{
					/* Peer advertised no window yet — assume a tiny one. */
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if (th->syn && !th->rst)
				{
					/*
					 *	Crossed SYN's are fine - but talking to
					 *	yourself is right out...
					 */
					if (sk->saddr == saddr && sk->daddr == daddr &&
					    sk->dummy_th.source == th->source &&
					    sk->dummy_th.dest == th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk, skb);
					}
					tcp_set_state(sk, TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

		/*
		 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
		 *	a more complex suggestion for fixing these reuse issues in RFC1644
		 *	but not yet ready for general use. Also see RFC1379.
		 */
#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(th->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if (sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err = ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			/* Re-resolve: a listener on the same port may take the SYN. */
			sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state == TCP_LISTEN)
			{
				sk->inuse = 1;
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				/* Seed the new connection well above the old sequence space. */
				tcp_conn_request(sk, skb, daddr, saddr, opt, dev, seq + 128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */
	if (!tcp_sequence(sk, th, len, opt, saddr, dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if (th->rst)
		return tcp_std_reset(sk, skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */
	if (th->syn && !syn_ok)
	{
		/* Unexpected SYN in a synchronised state — reset both ends. */
		tcp_reset(daddr, saddr, th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk, skb);
	}

	/*
	 *	Process the ACK
	 */
	if (th->ack && !tcp_ack(sk, th, saddr, len))
	{
		/*
		 *	Our three way handshake failed.
		 */
		if (sk->state == TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	Process urgent data
	 */
	if (tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	Process the encapsulated data
	 */
	if (tcp_data(skb, sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */
	release_sock(sk);
	return 0;
}
5185 /*5186 * This routine sends a packet with an out of date sequence5187 * number. It assumes the other end will try to ack it.5188 */5189
/*
 *	This routine sends a packet with an out of date sequence
 *	number. It assumes the other end will try to ack it.
 *
 *	Used as the zero-window probe.  If data is waiting and the peer
 *	window has partially opened (sender-side SWS avoidance), a copy
 *	of the head of the write queue, trimmed to the window, is sent
 *	instead of the bare keepalive-style probe.
 */
static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff, *skb;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 *	Write data can still be transmitted/retransmitted in the
	 *	following states. If any other state is encountered, return.
	 *	[listen/close will never occur here anyway]
	 */
	if (sk->state != TCP_ESTABLISHED &&
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 &&
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	)
	{
		return;
	}

	if (before(sk->sent_seq, sk->window_seq) &&
	    (skb = skb_peek(&sk->write_queue)))
	{
		/*
		 *	We are probing the opening of a window
		 *	but the window size is != 0
		 *	must have been a result SWS advoidance ( sender )
		 */
		struct iphdr *iph;
		struct tcphdr *th;
		struct tcphdr *nth;
		unsigned long win_size;
#if 0
		unsigned long ow_size;
#endif
		void *tcp_data_start;

		/*
		 *	How many bytes can we send ?
		 */
		win_size = sk->window_seq - sk->sent_seq;

		/*
		 *	Recover the buffer pointers
		 */
		iph = (struct iphdr *)skb->ip_hdr;
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));

		/*
		 *	Grab the data for a temporary frame
		 */
		buff = sock_wmalloc(sk, win_size + th->doff * 4 +
				    (iph->ihl << 2) +
				    sk->prot->max_header + 15,
				    1, GFP_ATOMIC);
		if (buff == NULL)
			return;

		/*
		 *	If we strip the packet on the write queue we must
		 *	be ready to retransmit this one
		 */
		buff->free = /*0*/1;

		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put headers on the new packet
		 */
		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
					     IPPROTO_TCP, sk->opt, buff->truesize,
					     sk->ip_tos, sk->ip_ttl);
		if (tmp < 0)
		{
			sock_wfree(sk, buff);
			return;
		}

		/*
		 *	Move the TCP header over
		 */
		buff->dev = dev;

		nth = (struct tcphdr *) skb_put(buff, th->doff * 4);

		memcpy(nth, th, th->doff * 4);

		/*
		 *	Correct the new header
		 */
		nth->ack = 1;
		nth->ack_seq = ntohl(sk->acked_seq);
		nth->window = ntohs(tcp_select_window(sk));
		nth->check = 0;

		/*
		 *	Find the first data byte.
		 */
		tcp_data_start = skb->data + skb->dev->hard_header_len +
			(iph->ihl << 2) + th->doff * 4;

		/*
		 *	Add it to our new buffer
		 */
		memcpy(skb_put(buff, win_size), tcp_data_start, win_size);

		/*
		 *	Remember our right edge sequence number.
		 */
		buff->h.seq = sk->sent_seq + win_size;
		sk->sent_seq = buff->h.seq;	/* Hack */
#if 0
		/*
		 *	now: shrink the queue head segment
		 */
		th->check = 0;
		ow_size = skb->len - win_size -
			((unsigned long) (tcp_data_start - (void *) skb->data));

		memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
		skb_trim(skb, skb->len - win_size);
		sk->sent_seq += win_size;
		th->seq = htonl(sk->sent_seq);
		if (th->urg)
		{
			unsigned short urg_ptr;

			urg_ptr = ntohs(th->urg_ptr);
			if (urg_ptr <= win_size)
				th->urg = 0;
			else
			{
				urg_ptr -= win_size;
				th->urg_ptr = htons(urg_ptr);
				nth->urg_ptr = htons(win_size);
			}
		}
#else
		/* Clear URG on the copy if the urgent pointer falls outside it. */
		if (th->urg && ntohs(th->urg_ptr) < win_size)
			nth->urg = 0;
#endif

		/*
		 *	Checksum the split buffer
		 */
		tcp_send_check(nth, sk->saddr, sk->daddr,
			       nth->doff * 4 + win_size, sk);
	}
	else
	{
		/* No sendable data: build a bare one-byte-in-the-past probe ACK. */
		buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
		if (buff == NULL)
			return;

		buff->free = 1;
		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put in the IP header and routing stuff.
		 */
		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
					     IPPROTO_TCP, sk->opt, MAX_ACK_SIZE, sk->ip_tos, sk->ip_ttl);
		if (tmp < 0)
		{
			sock_wfree(sk, buff);
			return;
		}

		t1 = (struct tcphdr *)skb_put(buff, sizeof(struct tcphdr));
		memcpy(t1, (void *) &sk->dummy_th, sizeof(*t1));

		/*
		 *	Use a previous sequence.
		 *	This should cause the other end to send an ack.
		 */
		t1->seq = htonl(sk->sent_seq - 1);
		t1->ack = 1;
		t1->res1 = 0;
		t1->res2 = 0;
		t1->rst = 0;
		t1->urg = 0;
		t1->psh = 0;
		t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
		t1->syn = 0;
		t1->ack_seq = ntohl(sk->acked_seq);
		t1->window = ntohs(tcp_select_window(sk));
		t1->doff = sizeof(*t1) / 4;
		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	}

	/*
	 *	Send it.
	 */
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
5411 /*5412 * A window probe timeout has occurred.5413 */5414
5415 voidtcp_send_probe0(structsock *sk)
/* */5416 {5417 if (sk->zapped)
5418 return; /* After a valid reset we can send no more */5419
5420 tcp_write_wakeup(sk);
5421
5422 sk->backoff++;
5423 sk->rto = min(sk->rto << 1, 120*HZ);
5424 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5425 sk->retransmits++;
5426 sk->prot->retransmits ++;
5427 }5428
5429 /*5430 * Socket option code for TCP. 5431 */5432
5433 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5434 {5435 intval,err;
5436
5437 if(level!=SOL_TCP)
5438 returnip_setsockopt(sk,level,optname,optval,optlen);
5439
5440 if (optval == NULL)
5441 return(-EINVAL);
5442
5443 err=verify_area(VERIFY_READ, optval, sizeof(int));
5444 if(err)
5445 returnerr;
5446
5447 val = get_user((int *)optval);
5448
5449 switch(optname)
5450 {5451 caseTCP_MAXSEG:
5452 /*5453 * values greater than interface MTU won't take effect. however at5454 * the point when this call is done we typically don't yet know5455 * which interface is going to be used5456 */5457 if(val<1||val>MAX_WINDOW)
5458 return -EINVAL;
5459 sk->user_mss=val;
5460 return 0;
5461 caseTCP_NODELAY:
5462 sk->nonagle=(val==0)?0:1;
5463 return 0;
5464 default:
5465 return(-ENOPROTOOPT);
5466 }5467 }5468
5469 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5470 {5471 intval,err;
5472
5473 if(level!=SOL_TCP)
5474 returnip_getsockopt(sk,level,optname,optval,optlen);
5475
5476 switch(optname)
5477 {5478 caseTCP_MAXSEG:
5479 val=sk->user_mss;
5480 break;
5481 caseTCP_NODELAY:
5482 val=sk->nonagle;
5483 break;
5484 default:
5485 return(-ENOPROTOOPT);
5486 }5487 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5488 if(err)
5489 returnerr;
5490 put_user(sizeof(int),(int *) optlen);
5491
5492 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5493 if(err)
5494 returnerr;
5495 put_user(val,(int *)optval);
5496
5497 return(0);
5498 }5499
5500
/*
 *	The TCP protocol operations table registered with the INET layer.
 *	Slot meanings below are inferred from the handler names; the
 *	struct proto declaration is elsewhere — confirm field order there.
 */
struct proto tcp_prot = {
	tcp_close,		/* close */
	tcp_read,		/* read */
	tcp_write,		/* write */
	tcp_sendto,		/* sendto */
	tcp_recvfrom,		/* recvfrom */
	ip_build_header,	/* build_header */
	tcp_connect,		/* connect */
	tcp_accept,		/* accept */
	ip_queue_xmit,		/* queue_xmit */
	tcp_retransmit,		/* retransmit */
	tcp_write_wakeup,	/* write_wakeup */
	tcp_read_wakeup,	/* read_wakeup */
	tcp_rcv,		/* rcv */
	tcp_select,		/* select */
	tcp_ioctl,		/* ioctl */
	NULL,			/* init — none needed for TCP */
	tcp_shutdown,		/* shutdown */
	tcp_setsockopt,		/* setsockopt */
	tcp_getsockopt,		/* getsockopt */
	tcp_sendmsg,		/* sendmsg */
	tcp_recvmsg,		/* recvmsg */
	128,			/* presumably max_header — verify against struct proto */
	0,			/* counter, zero-initialised (likely retransmits) */
	"TCP",			/* protocol name */
	0, 0,			/* zero-initialised bookkeeping fields */
	{NULL,}			/* socket hash/array, initially empty */
};