1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while 26 * sk->inuse=1 and was trying to connect 27 * (tcp_err()). 28 * Alan Cox : All icmp error handling was broken 29 * pointers passed where wrong and the 30 * socket was looked up backwards. Nobody 31 * tested any icmp error code obviously. 32 * Alan Cox : tcp_err() now handled properly. It 33 * wakes people on errors. select 34 * behaves and the icmp error race 35 * has gone by moving it into sock.c 36 * Alan Cox : tcp_send_reset() fixed to work for 37 * everything not just packets for 38 * unknown sockets. 39 * Alan Cox : tcp option processing. 40 * Alan Cox : Reset tweaked (still not 100%) [Had 41 * syn rule wrong] 42 * Herp Rosmanith : More reset fixes 43 * Alan Cox : No longer acks invalid rst frames. 44 * Acking any kind of RST is right out. 45 * Alan Cox : Sets an ignore me flag on an rst 46 * receive otherwise odd bits of prattle 47 * escape still 48 * Alan Cox : Fixed another acking RST frame bug. 49 * Should stop LAN workplace lockups. 
50 * Alan Cox : Some tidyups using the new skb list 51 * facilities 52 * Alan Cox : sk->keepopen now seems to work 53 * Alan Cox : Pulls options out correctly on accepts 54 * Alan Cox : Fixed assorted sk->rqueue->next errors 55 * Alan Cox : PSH doesn't end a TCP read. Switched a 56 * bit to skb ops. 57 * Alan Cox : Tidied tcp_data to avoid a potential 58 * nasty. 59 * Alan Cox : Added some better commenting, as the 60 * tcp is hard to follow 61 * Alan Cox : Removed incorrect check for 20 * psh 62 * Michael O'Reilly : ack < copied bug fix. 63 * Johannes Stille : Misc tcp fixes (not all in yet). 64 * Alan Cox : FIN with no memory -> CRASH 65 * Alan Cox : Added socket option proto entries. 66 * Also added awareness of them to accept. 67 * Alan Cox : Added TCP options (SOL_TCP) 68 * Alan Cox : Switched wakeup calls to callbacks, 69 * so the kernel can layer network 70 * sockets. 71 * Alan Cox : Use ip_tos/ip_ttl settings. 72 * Alan Cox : Handle FIN (more) properly (we hope). 73 * Alan Cox : RST frames sent on unsynchronised 74 * state ack error. 75 * Alan Cox : Put in missing check for SYN bit. 76 * Alan Cox : Added tcp_select_window() aka NET2E 77 * window non shrink trick. 78 * Alan Cox : Added a couple of small NET2E timer 79 * fixes 80 * Charles Hedrick : TCP fixes 81 * Toomas Tamm : TCP window fixes 82 * Alan Cox : Small URG fix to rlogin ^C ack fight 83 * Charles Hedrick : Rewrote most of it to actually work 84 * Linus : Rewrote tcp_read() and URG handling 85 * completely 86 * Gerhard Koerting: Fixed some missing timer handling 87 * Matthew Dillon : Reworked TCP machine states as per RFC 88 * Gerhard Koerting: PC/TCP workarounds 89 * Adam Caldwell : Assorted timer/timing errors 90 * Matthew Dillon : Fixed another RST bug 91 * Alan Cox : Move to kernel side addressing changes. 92 * Alan Cox : Beginning work on TCP fastpathing 93 * (not yet usable) 94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
95 * Alan Cox : TCP fast path debugging 96 * Alan Cox : Window clamping 97 * Michael Riepe : Bug in tcp_check() 98 * Matt Dillon : More TCP improvements and RST bug fixes 99 * Matt Dillon : Yet more small nasties remove from the 100 * TCP code (Be very nice to this man if 101 * tcp finally works 100%) 8) 102 * Alan Cox : BSD accept semantics. 103 * Alan Cox : Reset on closedown bug. 104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 105 * Michael Pall : Handle select() after URG properly in 106 * all cases. 107 * Michael Pall : Undo the last fix in tcp_read_urg() 108 * (multi URG PUSH broke rlogin). 109 * Michael Pall : Fix the multi URG PUSH problem in 110 * tcp_readable(), select() after URG 111 * works now. 112 * Michael Pall : recv(...,MSG_OOB) never blocks in the 113 * BSD api. 114 * Alan Cox : Changed the semantics of sk->socket to 115 * fix a race and a signal problem with 116 * accept() and async I/O. 117 * Alan Cox : Relaxed the rules on tcp_sendto(). 118 * Yury Shevchuk : Really fixed accept() blocking problem. 119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 120 * clients/servers which listen in on 121 * fixed ports. 122 * Alan Cox : Cleaned the above up and shrank it to 123 * a sensible code size. 124 * Alan Cox : Self connect lockup fix. 125 * Alan Cox : No connect to multicast. 126 * Ross Biro : Close unaccepted children on master 127 * socket close. 128 * Alan Cox : Reset tracing code. 129 * Alan Cox : Spurious resets on shutdown. 130 * Alan Cox : Giant 15 minute/60 second timer error 131 * Alan Cox : Small whoops in selecting before an 132 * accept. 133 * Alan Cox : Kept the state trace facility since 134 * it's handy for debugging. 135 * Alan Cox : More reset handler fixes. 
136 * Alan Cox : Started rewriting the code based on 137 * the RFC's for other useful protocol 138 * references see: Comer, KA9Q NOS, and 139 * for a reference on the difference 140 * between specifications and how BSD 141 * works see the 4.4lite source. 142 * A.N.Kuznetsov : Don't time wait on completion of tidy 143 * close. 144 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 145 * Linus Torvalds : Fixed BSD port reuse to work first syn 146 * Alan Cox : Reimplemented timers as per the RFC 147 * and using multiple timers for sanity. 148 * Alan Cox : Small bug fixes, and a lot of new 149 * comments. 150 * Alan Cox : Fixed dual reader crash by locking 151 * the buffers (much like datagram.c) 152 * Alan Cox : Fixed stuck sockets in probe. A probe 153 * now gets fed up of retrying without 154 * (even a no space) answer. 155 * Alan Cox : Extracted closing code better 156 * Alan Cox : Fixed the closing state machine to 157 * resemble the RFC. 158 * Alan Cox : More 'per spec' fixes. 159 * Jorge Cwik : Even faster checksumming. 160 * Alan Cox : tcp_data() doesn't ack illegal PSH 161 * only frames. At least one pc tcp stack 162 * generates them. 163 * Alan Cox : Cache last socket. 164 * Alan Cox : Per route irtt. 165 * Matt Day : Select() match BSD precisely on error 166 * Alan Cox : New buffers 167 * Marc Tamsky : Various sk->prot->retransmits and 168 * sk->retransmits misupdating fixed. 169 * Fixed tcp_write_timeout: stuck close, 170 * and TCP syn retries gets used now. 171 * Mark Yarvis : In tcp_read_wakeup(), don't send an 172 * ack if stat is TCP_CLOSED. 173 * Alan Cox : Look up device on a retransmit - routes may 174 * change. Doesn't yet cope with MSS shrink right 175 * but its a start! 176 * Marc Tamsky : Closing in closing fixes. 177 * Mike Shaver : RFC1122 verifications. 178 * Alan Cox : rcv_saddr errors. 179 * Alan Cox : Block double connect(). 180 * Alan Cox : Small hooks for enSKIP. 181 * Alexey Kuznetsov: Path MTU discovery. 
182 * Alan Cox : Support soft errors. 183 * Alan Cox : Fix MTU discovery pathalogical case 184 * when the remote claims no mtu! 185 * Marc Tamsky : TCP_CLOSE fix. 186 * Colin (G3TNE) : Send a reset on syn ack replies in 187 * window but wrong (fixes NT lpd problems) 188 * Pedro Roque : Better TCP window handling, delayed ack. 189 * Joerg Reuter : No modification of locked buffers in 190 * tcp_do_retransmit() 191 * 192 * To Fix: 193 * Fast path the code. Two things here - fix the window calculation 194 * so it doesn't iterate over the queue, also spot packets with no funny 195 * options arriving in order and process directly. 196 * 197 * Rewrite output state machine to use a single queue. 198 * Speed up input assembly algorithm. 199 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 200 * could do with it working on IPv4 201 * User settable/learned rtt/max window/mtu 202 * 203 * Change the fundamental structure to a single send queue maintained 204 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 205 * active routes too]). Cut the queue off in tcp_retransmit/ 206 * tcp_transmit. 207 * Change the receive queue to assemble as it goes. This lets us 208 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 209 * tcp_data/tcp_read as well as the window shrink crud. 210 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 211 * tcp_queue_skb seem obvious routines to extract. 212 * 213 * This program is free software; you can redistribute it and/or 214 * modify it under the terms of the GNU General Public License 215 * as published by the Free Software Foundation; either version 216 * 2 of the License, or(at your option) any later version. 217 * 218 * Description of States: 219 * 220 * TCP_SYN_SENT sent a connection request, waiting for ack 221 * 222 * TCP_SYN_RECV received a connection request, sent ack, 223 * waiting for final ack in three-way handshake. 
224 * 225 * TCP_ESTABLISHED connection established 226 * 227 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 228 * transmission of remaining buffered data 229 * 230 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 231 * to shutdown 232 * 233 * TCP_CLOSING both sides have shutdown but we still have 234 * data we have to finish sending 235 * 236 * TCP_TIME_WAIT timeout to catch resent junk before entering 237 * closed, can only be entered from FIN_WAIT2 238 * or CLOSING. Required because the other end 239 * may not have gotten our last ACK causing it 240 * to retransmit the data packet (which we ignore) 241 * 242 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 243 * us to finish writing our data and to shutdown 244 * (we have to close() to move on to LAST_ACK) 245 * 246 * TCP_LAST_ACK our side has shutdown after remote has 247 * shutdown. There may still be data in our 248 * buffer that we have to finish sending 249 * 250 * TCP_CLOSE socket is finished 251 */ 252
253 /* 254 * RFC1122 status: 255 * NOTE: I'm not going to be doing comments in the code for this one except 256 * for violations and the like. tcp.c is just too big... If I say something 257 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out 258 * with Alan. -- MS 950903 259 * 260 * Use of PSH (4.2.2.2) 261 * MAY aggregate data sent without the PSH flag. (does) 262 * MAY queue data received without the PSH flag. (does) 263 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't) 264 * MAY implement PSH on send calls. (doesn't, thus:) 265 * MUST NOT buffer data indefinitely (doesn't [1 second]) 266 * MUST set PSH on last segment (does) 267 * MAY pass received PSH to application layer (doesn't) 268 * SHOULD send maximum-sized segment whenever possible. (almost always does) 269 * 270 * Window Size (4.2.2.3, 4.2.2.16) 271 * MUST treat window size as an unsigned number (does) 272 * SHOULD treat window size as a 32-bit number (does not) 273 * MUST NOT shrink window once it is offered (does not normally) 274 * 275 * Urgent Pointer (4.2.2.4) 276 * **MUST point urgent pointer to last byte of urgent data (not right 277 * after). (doesn't, to be like BSD) 278 * MUST inform application layer asynchronously of incoming urgent 279 * data. (does) 280 * MUST provide application with means of determining the amount of 281 * urgent data pending. (does) 282 * **MUST support urgent data sequence of arbitrary length. (doesn't, but 283 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) 284 * [Follows BSD 1 byte of urgent data] 285 * 286 * TCP Options (4.2.2.5) 287 * MUST be able to receive TCP options in any segment. (does) 288 * MUST ignore unsupported options (does) 289 * 290 * Maximum Segment Size Option (4.2.2.6) 291 * MUST implement both sending and receiving MSS. (does) 292 * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send 293 * it always). 
(does, even when MSS == 536, which is legal) 294 * MUST assume MSS == 536 if no MSS received at connection setup (does) 295 * MUST calculate "effective send MSS" correctly: 296 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) 297 * (does - but allows operator override) 298 * 299 * TCP Checksum (4.2.2.7) 300 * MUST generate and check TCP checksum. (does) 301 * 302 * Initial Sequence Number Selection (4.2.2.8) 303 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's 304 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is 305 * necessary for 10Mbps networks - and harder than BSD to spoof!) 306 * 307 * Simultaneous Open Attempts (4.2.2.10) 308 * MUST support simultaneous open attempts (does) 309 * 310 * Recovery from Old Duplicate SYN (4.2.2.11) 311 * MUST keep track of active vs. passive open (does) 312 * 313 * RST segment (4.2.2.12) 314 * SHOULD allow an RST segment to contain data (does, but doesn't do 315 * anything with it, which is standard) 316 * 317 * Closing a Connection (4.2.2.13) 318 * MUST inform application of whether connection was closed by RST or 319 * normal close. (does) 320 * MAY allow "half-duplex" close (treat connection as closed for the 321 * local app, even before handshake is done). (does) 322 * MUST linger in TIME_WAIT for 2 * MSL (does) 323 * 324 * Retransmission Timeout (4.2.2.15) 325 * MUST implement Jacobson's slow start and congestion avoidance 326 * stuff. (does) 327 * 328 * Probing Zero Windows (4.2.2.17) 329 * MUST support probing of zero windows. (does) 330 * MAY keep offered window closed indefinitely. (does) 331 * MUST allow remote window to stay closed indefinitely. (does) 332 * 333 * Passive Open Calls (4.2.2.18) 334 * MUST NOT let new passive open affect other connections. (doesn't) 335 * MUST support passive opens (LISTENs) concurrently. (does) 336 * 337 * Time to Live (4.2.2.19) 338 * MUST make TCP TTL configurable. 
(does - IP_TTL option) 339 * 340 * Event Processing (4.2.2.20) 341 * SHOULD queue out-of-order segments. (does) 342 * MUST aggregate ACK segments whenever possible. (does but badly) 343 * 344 * Retransmission Timeout Calculation (4.2.3.1) 345 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO 346 * calculation. (does, or at least explains them in the comments 8*b) 347 * SHOULD initialize RTT to 0 and RTO to 3 seconds. (does) 348 * 349 * When to Send an ACK Segment (4.2.3.2) 350 * SHOULD implement delayed ACK. (does) 351 * MUST keep ACK delay < 0.5 sec. (does) 352 * 353 * When to Send a Window Update (4.2.3.3) 354 * MUST implement receiver-side SWS. (does) 355 * 356 * When to Send Data (4.2.3.4) 357 * MUST implement sender-side SWS. (does) 358 * SHOULD implement Nagle algorithm. (does) 359 * 360 * TCP Connection Failures (4.2.3.5) 361 * MUST handle excessive retransmissions "properly" (see the RFC). (does) 362 * SHOULD inform application layer of soft errors. (does) 363 * 364 * TCP Keep-Alives (4.2.3.6) 365 * MAY provide keep-alives. (does) 366 * MUST make keep-alives configurable on a per-connection basis. (does) 367 * MUST default to no keep-alives. (does) 368 * **MUST make keep-alive interval configurable. (doesn't) 369 * **MUST make default keep-alive interval > 2 hours. (doesn't) 370 * MUST NOT interpret failure to ACK keep-alive packet as dead 371 * connection. (doesn't) 372 * SHOULD send keep-alive with no data. (does) 373 * 374 * TCP Multihoming (4.2.3.7) 375 * MUST get source address from IP layer before sending first 376 * SYN. (does) 377 * MUST use same local address for all segments of a connection. (does) 378 * 379 * IP Options (4.2.3.8) 380 * MUST ignore unsupported IP options. (does) 381 * MAY support Time Stamp and Record Route. (does) 382 * MUST allow application to specify a source route. (does) 383 * MUST allow received Source Route option to set route for all future 384 * segments on this connection. 
(does not (security issues)) 385 * 386 * ICMP messages (4.2.3.9) 387 * MUST act on ICMP errors. (does) 388 * MUST slow transmission upon receipt of a Source Quench. (does) 389 * MUST NOT abort connection upon receipt of soft Destination 390 * Unreachables (0, 1, 5), Time Exceededs and Parameter 391 * Problems. (doesn't) 392 * SHOULD report soft Destination Unreachables etc. to the 393 * application. (does) 394 * SHOULD abort connection upon receipt of hard Destination Unreachable 395 * messages (2, 3, 4). (does) 396 * 397 * Remote Address Validation (4.2.3.10) 398 * MUST reject as an error OPEN for invalid remote IP address. (does) 399 * MUST ignore SYN with invalid source address. (does) 400 * MUST silently discard incoming SYN for broadcast/multicast 401 * address. (does) 402 * 403 * Asynchronous Reports (4.2.4.1) 404 * MUST provide mechanism for reporting soft errors to application 405 * layer. (does) 406 * 407 * Type of Service (4.2.4.2) 408 * MUST allow application layer to set Type of Service. (does IP_TOS) 409 * 410 * (Whew. -- MS 950903) 411 **/ 412
413 #include <linux/config.h>
414 #include <linux/types.h>
415 #include <linux/fcntl.h>
416
417 #include <net/icmp.h>
418 #include <net/tcp.h>
419
420 #include <asm/segment.h>
421
422 unsignedlongseq_offset;
423 structtcp_mibtcp_statistics;
424
425 staticvoidtcp_close(structsock *sk, unsignedlongtimeout);
426
427 /* 428 * The less said about this the better, but it works and will do for 1.2 (and 1.4 ;)) 429 */ 430
431 structwait_queue *master_select_wakeup;
432
433 /* 434 * Find someone to 'accept'. Must be called with 435 * the socket locked or with interrupts disabled 436 */ 437
438 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 439 { 440 structsk_buff *p=skb_peek(&s->receive_queue);
441 if(p==NULL)
442 returnNULL;
443 do 444 { 445 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
446 returnp;
447 p=p->next;
448 } 449 while(p!=(structsk_buff *)&s->receive_queue);
450 returnNULL;
451 } 452
453 /* 454 * Remove a completed connection and return it. This is used by 455 * tcp_accept() to get connections from the queue. 456 */ 457
458 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 459 { 460 structsk_buff *skb;
461 unsignedlongflags;
462 save_flags(flags);
463 cli();
464 skb=tcp_find_established(s);
465 if(skb!=NULL)
466 skb_unlink(skb); /* Take it off the queue */ 467 restore_flags(flags);
468 returnskb;
469 } 470
471 /* 472 * This routine closes sockets which have been at least partially 473 * opened, but not yet accepted. Currently it is only called by 474 * tcp_close, and timeout mirrors the value there. 475 */ 476
477 staticvoidtcp_close_pending (structsock *sk)
/* */ 478 { 479 structsk_buff *skb;
480
481 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
482 { 483 skb->sk->dead=1;
484 tcp_close(skb->sk, 0);
485 kfree_skb(skb, FREE_READ);
486 } 487 return;
488 } 489
490 /* 491 * Enter the time wait state. 492 */ 493
494 voidtcp_time_wait(structsock *sk)
/* */ 495 { 496 tcp_set_state(sk,TCP_TIME_WAIT);
497 sk->shutdown = SHUTDOWN_MASK;
498 if (!sk->dead)
499 sk->state_change(sk);
500 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
501 } 502
503
504 /* 505 * This routine is called by the ICMP module when it gets some 506 * sort of error condition. If err < 0 then the socket should 507 * be closed and the error returned to the user. If err > 0 508 * it's just the icmp type << 8 | icmp code. After adjustment 509 * header points to the first 8 bytes of the tcp header. We need 510 * to find the appropriate port. 511 */ 512
513 voidtcp_err(inttype, intcode, unsignedchar *header, __u32daddr,
/* */ 514 __u32saddr, structinet_protocol *protocol)
515 { 516 structtcphdr *th = (structtcphdr *)header;
517 structsock *sk;
518
519 /* 520 * This one is _WRONG_. FIXME urgently. 521 */ 522 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 523 structiphdr *iph=(structiphdr *)(header-sizeof(structiphdr));
524 #endif 525 th =(structtcphdr *)header;
526 sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
527
528 if (sk == NULL)
529 return;
530
531 if (type == ICMP_SOURCE_QUENCH)
532 { 533 /* 534 * FIXME: 535 * For now we will just trigger a linear backoff. 536 * The slow start code should cause a real backoff here. 537 */ 538 if (sk->cong_window > 4)
539 sk->cong_window--;
540 return;
541 } 542
543 if (type == ICMP_PARAMETERPROB)
544 { 545 sk->err=EPROTO;
546 sk->error_report(sk);
547 } 548
549 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 550 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
551 { 552 structrtable * rt;
553 /* 554 * Ugly trick to pass MTU to protocol layer. 555 * Really we should add argument "info" to error handler. 556 */ 557 unsignedshortnew_mtu = ntohs(iph->id);
558
559 if ((rt = sk->ip_route_cache) != NULL)
560 if (rt->rt_mtu > new_mtu)
561 rt->rt_mtu = new_mtu;
562
563 if (sk->mtu > new_mtu - sizeof(structiphdr) - sizeof(structtcphdr)
564 && new_mtu > sizeof(structiphdr)+sizeof(structtcphdr))
565 sk->mtu = new_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
566
567 return;
568 } 569 #endif 570
571 /* 572 * If we've already connected we will keep trying 573 * until we time out, or the user gives up. 574 */ 575
576 if (code < 13)
577 { 578 if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
579 { 580 sk->err = icmp_err_convert[code].errno;
581 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
582 { 583 tcp_statistics.TcpAttemptFails++;
584 tcp_set_state(sk,TCP_CLOSE);
585 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ 586 } 587 } 588 else/* Only an error on timeout */ 589 sk->err_soft = icmp_err_convert[code].errno;
590 } 591 } 592
593
594 /* 595 * Walk down the receive queue counting readable data until we hit the end or we find a gap 596 * in the received data queue (ie a frame missing that needs sending to us). Not 597 * sorting using two queues as data arrives makes life so much harder. 598 */ 599
600 staticinttcp_readable(structsock *sk)
/* */ 601 { 602 unsignedlongcounted;
603 unsignedlongamount;
604 structsk_buff *skb;
605 intsum;
606 unsignedlongflags;
607
608 if(sk && sk->debug)
609 printk("tcp_readable: %p - ",sk);
610
611 save_flags(flags);
612 cli();
613 if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
614 { 615 restore_flags(flags);
616 if(sk && sk->debug)
617 printk("empty\n");
618 return(0);
619 } 620
621 counted = sk->copied_seq; /* Where we are at the moment */ 622 amount = 0;
623
624 /* 625 * Do until a push or until we are out of data. 626 */ 627
628 do 629 { 630 if (before(counted, skb->seq)) /* Found a hole so stops here */ 631 break;
632 sum = skb->len - (counted - skb->seq); /* Length - header but start from where we are up to (avoid overlaps) */ 633 if (skb->h.th->syn)
634 sum++;
635 if (sum > 0)
636 {/* Add it up, move on */ 637 amount += sum;
638 if (skb->h.th->syn)
639 amount--;
640 counted += sum;
641 } 642 /* 643 * Don't count urg data ... but do it in the right place! 644 * Consider: "old_data (ptr is here) URG PUSH data" 645 * The old code would stop at the first push because 646 * it counted the urg (amount==1) and then does amount-- 647 * *after* the loop. This means tcp_readable() always 648 * returned zero if any URG PUSH was in the queue, even 649 * though there was normal data available. If we subtract 650 * the urg data right here, we even get it to work for more 651 * than one URG PUSH skb without normal data. 652 * This means that select() finally works now with urg data 653 * in the queue. Note that rlogin was never affected 654 * because it doesn't use select(); it uses two processes 655 * and a blocking read(). And the queue scan in tcp_read() 656 * was correct. Mike <pall@rz.uni-karlsruhe.de> 657 */ 658 if (skb->h.th->urg)
659 amount--; /* don't count urg data */ 660 if (amount && skb->h.th->psh) break;
661 skb = skb->next;
662 } 663 while(skb != (structsk_buff *)&sk->receive_queue);
664
665 restore_flags(flags);
666 if(sk->debug)
667 printk("got %lu bytes.\n",amount);
668 return(amount);
669 } 670
671 /* 672 * LISTEN is a special case for select.. 673 */ 674 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */ 675 { 676 if (sel_type == SEL_IN) { 677 intretval;
678
679 lock_sock(sk);
680 retval = (tcp_find_established(sk) != NULL);
681 release_sock(sk);
682 if (!retval)
683 select_wait(&master_select_wakeup,wait);
684 returnretval;
685 } 686 return 0;
687 } 688
689
690 /* 691 * Wait for a TCP event. 692 * 693 * Note that we don't need to lock the socket, as the upper select layers 694 * take care of normal races (between the test and the event) and we don't 695 * go look at any of the socket buffers directly. 696 */ 697 staticinttcp_select(structsock *sk, intsel_type, select_table *wait)
/* */ 698 { 699 if (sk->state == TCP_LISTEN)
700 returntcp_listen_select(sk, sel_type, wait);
701
702 switch(sel_type) { 703 caseSEL_IN:
704 if (sk->err)
705 return 1;
706 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
707 break;
708
709 if (sk->shutdown & RCV_SHUTDOWN)
710 return 1;
711
712 if (sk->acked_seq == sk->copied_seq)
713 break;
714
715 if (sk->urg_seq != sk->copied_seq ||
716 sk->acked_seq != sk->copied_seq+1 ||
717 sk->urginline || !sk->urg_data)
718 return 1;
719 break;
720
721 caseSEL_OUT:
722 if (sk->err)
723 return 1;
724 if (sk->shutdown & SEND_SHUTDOWN)
725 return 0;
726 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
727 break;
728 /* 729 * This is now right thanks to a small fix 730 * by Matt Dillon. 731 */ 732
733 if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
734 break;
735 return 1;
736
737 caseSEL_EX:
738 if (sk->urg_data)
739 return 1;
740 break;
741 } 742 select_wait(sk->sleep, wait);
743 return 0;
744 } 745
746 inttcp_ioctl(structsock *sk, intcmd, unsignedlongarg)
/* */ 747 { 748 interr;
749 switch(cmd)
750 { 751
752 caseTIOCINQ:
753 #ifdef FIXME /* FIXME: */ 754 caseFIONREAD:
755 #endif 756 { 757 unsignedlongamount;
758
759 if (sk->state == TCP_LISTEN)
760 return(-EINVAL);
761
762 lock_sock(sk);
763 amount = tcp_readable(sk);
764 release_sock(sk);
765 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
766 if(err)
767 returnerr;
768 put_user(amount, (int *)arg);
769 return(0);
770 } 771 caseSIOCATMARK:
772 { 773 intansw = sk->urg_data && sk->urg_seq == sk->copied_seq;
774
775 err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
776 if (err)
777 returnerr;
778 put_user(answ,(int *) arg);
779 return(0);
780 } 781 caseTIOCOUTQ:
782 { 783 unsignedlongamount;
784
785 if (sk->state == TCP_LISTEN) return(-EINVAL);
786 amount = sock_wspace(sk);
787 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
788 if(err)
789 returnerr;
790 put_user(amount, (int *)arg);
791 return(0);
792 } 793 default:
794 return(-EINVAL);
795 } 796 } 797
798
799 /* 800 * This routine computes a TCP checksum. 801 * 802 * Modified January 1995 from a go-faster DOS routine by 803 * Jorge Cwik <jorge@laser.satlink.net> 804 */ 805 #undefDEBUG_TCP_CHECK 806 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */ 807 unsignedlongdaddr, intlen, structsk_buff *skb)
808 { 809 #ifdefDEBUG_TCP_CHECK 810 u16check;
811 #endif 812 th->check = 0;
813 th->check = tcp_check(th, len, saddr, daddr,
814 csum_partial((char *)th,sizeof(*th),skb->csum));
815
816 #ifdefDEBUG_TCP_CHECK 817 check = th->check;
818 th->check = 0;
819 th->check = tcp_check(th, len, saddr, daddr,
820 csum_partial((char *)th,len,0));
821 if (check != th->check) { 822 staticintcount = 0;
823 if (++count < 10) { 824 printk("Checksum %x (%x) from %p\n", th->check, check,
825 (&th)[-1]);
826 printk("TCP=<off:%d a:%d s:%d f:%d>\n", th->doff*4, th->ack, th->syn, th->fin);
827 } 828 } 829 #endif 830 } 831
832
833 /* 834 * This routine builds a generic TCP header. 835 */ 836
837 extern__inlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */ 838 { 839
840 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
841 th->seq = htonl(sk->write_seq);
842 th->psh =(push == 0) ? 1 : 0;
843 sk->ack_backlog = 0;
844 sk->bytes_rcv = 0;
845 sk->ack_timed = 0;
846 th->ack_seq = htonl(sk->acked_seq);
847 sk->window = tcp_select_window(sk);
848 th->window = htons(sk->window);
849
850 return(sizeof(*th));
851 } 852
853 /* 854 * Wait for a socket to get into the connected state 855 */ 856 staticvoidwait_for_tcp_connect(structsock * sk)
/* */ 857 { 858 release_sock(sk);
859 cli();
860 if (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT && sk->err == 0)
861 { 862 interruptible_sleep_on(sk->sleep);
863 } 864 sti();
865 lock_sock(sk);
866 } 867
868 /* 869 * Wait for more memory for a socket 870 */ 871 staticvoidwait_for_tcp_memory(structsock * sk)
/* */ 872 { 873 release_sock(sk);
874 cli();
875 if (sk->wmem_alloc*2 > sk->sndbuf &&
876 (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
877 && sk->err == 0)
878 { 879 sk->socket->flags &= ~SO_NOSPACE;
880 interruptible_sleep_on(sk->sleep);
881 } 882 sti();
883 lock_sock(sk);
884 } 885
886
887 /* 888 * This routine copies from a user buffer into a socket, 889 * and starts the transmit system. 890 */ 891
892 staticintdo_tcp_sendmsg(structsock *sk, structmsghdr *msg,
/* */ 893 intlen, intnonblock, intflags)
894 { 895 intcopied = 0;
896 intcopy;
897 inttmp;
898 intseglen;
899 intiovct=0;
900 structsk_buff *skb;
901 structsk_buff *send_tmp;
902 structproto *prot;
903 structdevice *dev = NULL;
904 unsignedchar *from;
905
906 /* 907 * Ok commence sending 908 */ 909
910 while(iovct<msg->msg_iovlen)
911 { 912 seglen=msg->msg_iov[iovct].iov_len;
913 from=msg->msg_iov[iovct++].iov_base;
914 prot = sk->prot;
915 while(seglen > 0)
916 { 917 /* 918 * Stop on errors 919 */ 920 if (sk->err)
921 { 922 if (copied)
923 returncopied;
924 returnsock_error(sk);
925 } 926
927 /* 928 * Make sure that we are established. 929 */ 930 if (sk->shutdown & SEND_SHUTDOWN)
931 { 932 if (copied)
933 returncopied;
934 return -EPIPE;
935 } 936
937 /* 938 * Wait for a connection to finish. 939 */ 940 while (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
941 { 942 if (copied)
943 returncopied;
944
945 if (sk->err)
946 returnsock_error(sk);
947
948 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
949 { 950 if (sk->keepopen)
951 send_sig(SIGPIPE, current, 0);
952 return -EPIPE;
953 } 954
955 if (nonblock)
956 return -EAGAIN;
957
958 if (current->signal & ~current->blocked)
959 return -ERESTARTSYS;
960
961 wait_for_tcp_connect(sk);
962 } 963
964 /* 965 * The following code can result in copy <= if sk->mss is ever 966 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window). 967 * sk->mtu is constant once SYN processing is finished. I.e. we 968 * had better not get here until we've seen his SYN and at least one 969 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.) 970 * But ESTABLISHED should guarantee that. sk->max_window is by definition 971 * non-decreasing. Note that any ioctl to set user_mss must be done 972 * before the exchange of SYN's. If the initial ack from the other 973 * end has a window of 0, max_window and thus mss will both be 0. 974 */ 975
976 /* 977 * Now we need to check if we have a half built packet. 978 */ 979 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 980 /* 981 * FIXME: I'm almost sure that this fragment is BUG, 982 * but it works... I do not know why 8) --ANK 983 * 984 * Really, we should rebuild all the queues... 985 * It's difficult. Temprorary hack is to send all 986 * queued segments with allowed fragmentation. 987 */ 988 { 989 intnew_mss = min(sk->mtu, sk->max_window);
990 if (new_mss < sk->mss)
991 { 992 tcp_send_partial(sk);
993 sk->mss = new_mss;
994 } 995 } 996 #endif 997
998 if ((skb = tcp_dequeue_partial(sk)) != NULL)
999 {1000 inttcp_size;
1001
1002 tcp_size = skb->tail - (unsignedchar *)(skb->h.th + 1);
1003
1004 /* Add more stuff to the end of skb->len */1005 if (!(flags & MSG_OOB))
1006 {1007 copy = min(sk->mss - tcp_size, seglen);
1008 if (copy <= 0)
1009 {1010 printk("TCP: **bug**: \"copy\" <= 0\n");
1011 return -EFAULT;
1012 }1013 tcp_size += copy;
1014 memcpy_fromfs(skb_put(skb,copy), from, copy);
1015 skb->csum = csum_partial(skb->tail - tcp_size, tcp_size, 0);
1016 from += copy;
1017 copied += copy;
1018 len -= copy;
1019 sk->write_seq += copy;
1020 seglen -= copy;
1021 }1022 if (tcp_size >= sk->mss || (flags & MSG_OOB) || !sk->packets_out)
1023 tcp_send_skb(sk, skb);
1024 else1025 tcp_enqueue_partial(skb, sk);
1026 continue;
1027 }1028
1029 /*1030 * We also need to worry about the window.1031 * If window < 1/2 the maximum window we've seen from this1032 * host, don't use it. This is sender side1033 * silly window prevention, as specified in RFC1122.1034 * (Note that this is different than earlier versions of1035 * SWS prevention, e.g. RFC813.). What we actually do is 1036 * use the whole MSS. Since the results in the right1037 * edge of the packet being outside the window, it will1038 * be queued for later rather than sent.1039 */1040
1041 copy = sk->window_seq - sk->write_seq;
1042 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1043 copy = sk->mss;
1044 if (copy > seglen)
1045 copy = seglen;
1046
1047 /*1048 * We should really check the window here also. 1049 */1050
1051 send_tmp = NULL;
1052 if (copy < sk->mss && !(flags & MSG_OOB) && sk->packets_out)
1053 {1054 skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
1055 send_tmp = skb;
1056 }1057 else1058 {1059 skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
1060 }1061
1062 /*1063 * If we didn't get any memory, we need to sleep. 1064 */1065
1066 if (skb == NULL)
1067 {1068 sk->socket->flags |= SO_NOSPACE;
1069 if (nonblock)
1070 {1071 if (copied)
1072 returncopied;
1073 return -EAGAIN;
1074 }1075
1076 if (current->signal & ~current->blocked)
1077 {1078 if (copied)
1079 returncopied;
1080 return -ERESTARTSYS;
1081 }1082
1083 wait_for_tcp_memory(sk);
1084 continue;
1085 }1086
1087 skb->sk = sk;
1088 skb->free = 0;
1089 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1090
1091 /*1092 * FIXME: we need to optimize this.1093 * Perhaps some hints here would be good.1094 */1095
1096 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1097 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1098 if (tmp < 0 )
1099 {1100 sock_wfree(sk, skb);
1101 if (copied)
1102 return(copied);
1103 return(tmp);
1104 }1105 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1106 skb->ip_hdr->frag_off |= htons(IP_DF);
1107 #endif1108 skb->dev = dev;
1109 skb->h.th =(structtcphdr *)skb_put(skb,sizeof(structtcphdr));
1110 tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
1111 if (tmp < 0)
1112 {1113 sock_wfree(sk, skb);
1114 if (copied)
1115 return(copied);
1116 return(tmp);
1117 }1118
1119 if (flags & MSG_OOB)
1120 {1121 skb->h.th->urg = 1;
1122 skb->h.th->urg_ptr = ntohs(copy);
1123 }1124
1125 skb->csum = csum_partial_copy_fromuser(from,
1126 skb_put(skb,copy), copy, 0);
1127
1128 from += copy;
1129 copied += copy;
1130 len -= copy;
1131 seglen -= copy;
1132 skb->free = 0;
1133 sk->write_seq += copy;
1134
1135 if (send_tmp != NULL)
1136 {1137 tcp_enqueue_partial(send_tmp, sk);
1138 continue;
1139 }1140 tcp_send_skb(sk, skb);
1141 }1142 }1143 sk->err = 0;
1144
1145 returncopied;
1146 }1147
1148
1149 staticinttcp_sendmsg(structsock *sk, structmsghdr *msg,
/* */1150 intlen, intnonblock, intflags)
1151 {1152 intretval = -EINVAL;
1153
1154 /*1155 * Do sanity checking for sendmsg/sendto/send1156 */1157
1158 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1159 gotoout;
1160 if (msg->msg_name) {1161 structsockaddr_in *addr=(structsockaddr_in *)msg->msg_name;
1162
1163 if (msg->msg_namelen < sizeof(*addr))
1164 gotoout;
1165 if (addr->sin_family && addr->sin_family != AF_INET)
1166 gotoout;
1167 retval = -ENOTCONN;
1168 if(sk->state == TCP_CLOSE)
1169 gotoout;
1170 retval = -EISCONN;
1171 if (addr->sin_port != sk->dummy_th.dest)
1172 gotoout;
1173 if (addr->sin_addr.s_addr != sk->daddr)
1174 gotoout;
1175 }1176
1177 lock_sock(sk);
1178 retval = do_tcp_sendmsg(sk, msg, len, nonblock, flags);
1179
1180 /*1181 * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly1182 * interactive fast network servers. It's meant to be on and1183 * it really improves the throughput though not the echo time1184 * on my slow slip link - Alan1185 *1186 * If not nagling we can send on the before case too..1187 */1188
1189 if (sk->partial) {1190 if (!sk->packets_out ||
1191 (sk->nonagle && before(sk->write_seq , sk->window_seq))) {1192 tcp_send_partial(sk);
1193 }1194 }1195
1196 release_sock(sk);
1197
1198 out:
1199 returnretval;
1200 }1201
1202
1203 /*1204 * Send an ack if one is backlogged at this point. Ought to merge1205 * this with tcp_send_ack().1206 * This is called for delayed acks also.1207 */1208
1209 voidtcp_read_wakeup(structsock *sk)
/* */1210 {1211 inttmp;
1212 structdevice *dev = NULL;
1213 structtcphdr *t1;
1214 structsk_buff *buff;
1215
1216 if (!sk->ack_backlog)
1217 return;
1218
1219 /*1220 * If we're closed, don't send an ack, or we'll get a RST1221 * from the closed destination.1222 */1223 if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
1224 return;
1225
1226 /*1227 * FIXME: we need to put code here to prevent this routine from1228 * being called. Being called once in a while is ok, so only check1229 * if this is the second time in a row.1230 */1231
1232 /*1233 * We need to grab some memory, and put together an ack,1234 * and then put it into the queue to be sent.1235 */1236
1237 buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
1238 if (buff == NULL)
1239 {1240 /* Try again real soon. */1241 tcp_reset_xmit_timer(sk, TIME_WRITE, HZ);
1242 return;
1243 }1244
1245 buff->sk = sk;
1246 buff->localroute = sk->localroute;
1247 buff->csum = 0;
1248
1249 /*1250 * Put in the IP header and routing stuff. 1251 */1252
1253 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
1254 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1255 if (tmp < 0)
1256 {1257 buff->free = 1;
1258 sock_wfree(sk, buff);
1259 return;
1260 }1261
1262 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
1263
1264 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
1265 t1->seq = htonl(sk->sent_seq);
1266
1267 sk->ack_backlog = 0;
1268 sk->bytes_rcv = 0;
1269
1270 sk->window = tcp_select_window(sk);
1271 t1->window = htons(sk->window);
1272 t1->ack_seq = htonl(sk->acked_seq);
1273 t1->doff = sizeof(*t1)/4;
1274 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), buff);
1275 sk->prot->queue_xmit(sk, dev, buff, 1);
1276 tcp_statistics.TcpOutSegs++;
1277 }1278
1279
1280 /*1281 * Handle reading urgent data. BSD has very simple semantics for1282 * this, no blocking and very strange errors 8)1283 */1284
1285 staticinttcp_recv_urg(structsock * sk, intnonblock,
/* */1286 structmsghdr *msg, intlen, intflags, int *addr_len)
1287 {1288 /*1289 * No URG data to read1290 */1291 if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
1292 return -EINVAL; /* Yes this is right ! */1293
1294 if (sk->err)
1295 returnsock_error(sk);
1296
1297 if (sk->state == TCP_CLOSE || sk->done)
1298 {1299 if (!sk->done)
1300 {1301 sk->done = 1;
1302 return 0;
1303 }1304 return -ENOTCONN;
1305 }1306
1307 if (sk->shutdown & RCV_SHUTDOWN)
1308 {1309 sk->done = 1;
1310 return 0;
1311 }1312 lock_sock(sk);
1313 if (sk->urg_data & URG_VALID)
1314 {1315 charc = sk->urg_data;
1316 if (!(flags & MSG_PEEK))
1317 sk->urg_data = URG_READ;
1318 memcpy_toiovec(msg->msg_iov, &c, 1);
1319 if(msg->msg_name)
1320 {1321 structsockaddr_in *sin=(structsockaddr_in *)msg->msg_name;
1322 sin->sin_family=AF_INET;
1323 sin->sin_addr.s_addr=sk->daddr;
1324 sin->sin_port=sk->dummy_th.dest;
1325 }1326 if(addr_len)
1327 *addr_len=sizeof(structsockaddr_in);
1328 release_sock(sk);
1329 return 1;
1330 }1331 release_sock(sk);
1332
1333 /*1334 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and1335 * the available implementations agree in this case:1336 * this call should never block, independent of the1337 * blocking state of the socket.1338 * Mike <pall@rz.uni-karlsruhe.de>1339 */1340 return -EAGAIN;
1341 }1342
1343 /*1344 * Release a skb if it is no longer needed. This routine1345 * must be called with interrupts disabled or with the1346 * socket locked so that the sk_buff queue operation is ok.1347 */1348
1349 staticinlinevoidtcp_eat_skb(structsock *sk, structsk_buff * skb)
/* */1350 {1351 sk->ack_backlog++;
1352 skb->sk = sk;
1353 __skb_unlink(skb, &sk->receive_queue);
1354 kfree_skb(skb, FREE_READ);
1355 }1356
1357 /*1358 * FIXME:1359 * This routine frees used buffers.1360 * It should consider sending an ACK to let the1361 * other end know we now have a bigger window.1362 */1363
1364 staticvoidcleanup_rbuf(structsock *sk)
/* */1365 {1366 structsk_buff *skb;
1367 unsignedlongrspace;
1368
1369 /*1370 * NOTE! The socket must be locked, so that we don't get1371 * a messed-up receive queue.1372 */1373 while ((skb=skb_peek(&sk->receive_queue)) != NULL) {1374 if (!skb->used || skb->users)
1375 break;
1376 tcp_eat_skb(sk, skb);
1377 }1378
1379 /*1380 * FIXME:1381 * At this point we should send an ack if the difference1382 * in the window, and the amount of space is bigger than1383 * TCP_WINDOW_DIFF.1384 */1385
1386 rspace=sock_rspace(sk);
1387 if(sk->debug)
1388 printk("sk->rspace = %lu\n", rspace);
1389 /*1390 * This area has caused the most trouble. The current strategy1391 * is to simply do nothing if the other end has room to send at1392 * least 3 full packets, because the ack from those will auto-1393 * matically update the window. If the other end doesn't think1394 * we have much space left, but we have room for at least 1 more1395 * complete packet than it thinks we do, we will send an ack1396 * immediately. Otherwise we will wait up to .5 seconds in case1397 * the user reads some more.1398 */1399
1400 /*1401 * It's unclear whether to use sk->mtu or sk->mss here. They differ only1402 * if the other end is offering a window smaller than the agreed on MSS1403 * (called sk->mtu here). In theory there's no connection between send1404 * and receive, and so no reason to think that they're going to send1405 * small packets. For the moment I'm using the hack of reducing the mss1406 * only on the send side, so I'm putting mtu here.1407 */1408
1409 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
1410 {1411 /* Send an ack right now. */1412 tcp_read_wakeup(sk);
1413 }1414 else1415 {1416 /* Force it to send an ack soon. */1417 intwas_active = del_timer(&sk->retransmit_timer);
1418 if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
1419 {1420 tcp_reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
1421 }1422 else1423 add_timer(&sk->retransmit_timer);
1424 }1425 }1426
1427
1428 /*1429 * This routine copies from a sock struct into the user buffer. 1430 */1431
1432 staticinttcp_recvmsg(structsock *sk, structmsghdr *msg,
/* */1433 intlen, intnonblock, intflags, int *addr_len)
1434 {1435 structwait_queuewait = {current, NULL};
1436 intcopied = 0;
1437 u32peek_seq;
1438 volatileu32 *seq; /* So gcc doesn't overoptimise */1439 unsignedlongused;
1440
1441 /* 1442 * This error should be checked. 1443 */1444
1445 if (sk->state == TCP_LISTEN)
1446 return -ENOTCONN;
1447
1448 /*1449 * Urgent data needs to be handled specially. 1450 */1451
1452 if (flags & MSG_OOB)
1453 returntcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
1454
1455 /*1456 * Copying sequence to update. This is volatile to handle1457 * the multi-reader case neatly (memcpy_to/fromfs might be 1458 * inline and thus not flush cached variables otherwise).1459 */1460
1461 peek_seq = sk->copied_seq;
1462 seq = &sk->copied_seq;
1463 if (flags & MSG_PEEK)
1464 seq = &peek_seq;
1465
1466 add_wait_queue(sk->sleep, &wait);
1467 lock_sock(sk);
1468 while (len > 0)
1469 {1470 structsk_buff * skb;
1471 u32offset;
1472
1473 /*1474 * Are we at urgent data? Stop if we have read anything.1475 */1476
1477 if (copied && sk->urg_data && sk->urg_seq == *seq)
1478 break;
1479
1480 /*1481 * Next get a buffer.1482 */1483
1484 current->state = TASK_INTERRUPTIBLE;
1485
1486 skb = skb_peek(&sk->receive_queue);
1487 do1488 {1489 if (!skb)
1490 break;
1491 if (before(*seq, skb->seq))
1492 break;
1493 offset = *seq - skb->seq;
1494 if (skb->h.th->syn)
1495 offset--;
1496 if (offset < skb->len)
1497 gotofound_ok_skb;
1498 if (skb->h.th->fin)
1499 gotofound_fin_ok;
1500 if (!(flags & MSG_PEEK))
1501 skb->used = 1;
1502 skb = skb->next;
1503 }1504 while (skb != (structsk_buff *)&sk->receive_queue);
1505
1506 if (copied)
1507 break;
1508
1509 if (sk->err)
1510 {1511 copied = sock_error(sk);
1512 break;
1513 }1514
1515 if (sk->state == TCP_CLOSE)
1516 {1517 if (!sk->done)
1518 {1519 sk->done = 1;
1520 break;
1521 }1522 copied = -ENOTCONN;
1523 break;
1524 }1525
1526 if (sk->shutdown & RCV_SHUTDOWN)
1527 {1528 sk->done = 1;
1529 break;
1530 }1531
1532 if (nonblock)
1533 {1534 copied = -EAGAIN;
1535 break;
1536 }1537
1538 cleanup_rbuf(sk);
1539 release_sock(sk);
1540 sk->socket->flags |= SO_WAITDATA;
1541 schedule();
1542 sk->socket->flags &= ~SO_WAITDATA;
1543 lock_sock(sk);
1544
1545 if (current->signal & ~current->blocked)
1546 {1547 copied = -ERESTARTSYS;
1548 break;
1549 }1550 continue;
1551
1552 found_ok_skb:
1553 /*1554 * Lock the buffer. We can be fairly relaxed as1555 * an interrupt will never steal a buffer we are 1556 * using unless I've missed something serious in1557 * tcp_data.1558 */1559
1560 skb->users++;
1561
1562 /*1563 * Ok so how much can we use ? 1564 */1565
1566 used = skb->len - offset;
1567 if (len < used)
1568 used = len;
1569 /*1570 * Do we have urgent data here? 1571 */1572
1573 if (sk->urg_data)
1574 {1575 u32urg_offset = sk->urg_seq - *seq;
1576 if (urg_offset < used)
1577 {1578 if (!urg_offset)
1579 {1580 if (!sk->urginline)
1581 {1582 ++*seq;
1583 offset++;
1584 used--;
1585 }1586 }1587 else1588 used = urg_offset;
1589 }1590 }1591
1592 /*1593 * Copy it - We _MUST_ update *seq first so that we1594 * don't ever double read when we have dual readers1595 */1596
1597 *seq += used;
1598
1599 /*1600 * This memcpy_tofs can sleep. If it sleeps and we1601 * do a second read it relies on the skb->users to avoid1602 * a crash when cleanup_rbuf() gets called.1603 */1604
1605 memcpy_toiovec(msg->msg_iov,((unsignedchar *)skb->h.th) +
1606 skb->h.th->doff*4 + offset, used);
1607 copied += used;
1608 len -= used;
1609
1610 /*1611 * We now will not sleep again until we are finished1612 * with skb. Sorry if you are doing the SMP port1613 * but you'll just have to fix it neatly ;)1614 */1615
1616 skb->users --;
1617
1618 if (after(sk->copied_seq,sk->urg_seq))
1619 sk->urg_data = 0;
1620 if (used + offset < skb->len)
1621 continue;
1622
1623 /*1624 * Process the FIN.1625 */1626
1627 if (skb->h.th->fin)
1628 gotofound_fin_ok;
1629 if (flags & MSG_PEEK)
1630 continue;
1631 skb->used = 1;
1632 if (!skb->users)
1633 tcp_eat_skb(sk, skb);
1634 continue;
1635
1636 found_fin_ok:
1637 ++*seq;
1638 if (flags & MSG_PEEK)
1639 break;
1640
1641 /*1642 * All is done1643 */1644
1645 skb->used = 1;
1646 sk->shutdown |= RCV_SHUTDOWN;
1647 break;
1648
1649 }1650
1651 if(copied>0 && msg->msg_name)
1652 {1653 structsockaddr_in *sin=(structsockaddr_in *)msg->msg_name;
1654 sin->sin_family=AF_INET;
1655 sin->sin_addr.s_addr=sk->daddr;
1656 sin->sin_port=sk->dummy_th.dest;
1657 }1658 if(addr_len)
1659 *addr_len=sizeof(structsockaddr_in);
1660
1661 remove_wait_queue(sk->sleep, &wait);
1662 current->state = TASK_RUNNING;
1663
1664 /* Clean up data we have read: This will do ACK frames */1665 cleanup_rbuf(sk);
1666 release_sock(sk);
1667 returncopied;
1668 }1669
1670
1671
1672 /*1673 * State processing on a close. This implements the state shift for1674 * sending our FIN frame. Note that we only send a FIN for some 1675 * states. A shutdown() may have already sent the FIN, or we may be1676 * closed.1677 */1678
1679 staticinttcp_close_state(structsock *sk, intdead)
/* */1680 {1681 intns=TCP_CLOSE;
1682 intsend_fin=0;
1683 switch(sk->state)
1684 {1685 caseTCP_SYN_SENT: /* No SYN back, no FIN needed */1686 break;
1687 caseTCP_SYN_RECV:
1688 caseTCP_ESTABLISHED: /* Closedown begin */1689 ns=TCP_FIN_WAIT1;
1690 send_fin=1;
1691 break;
1692 caseTCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */1693 caseTCP_FIN_WAIT2:
1694 caseTCP_CLOSING:
1695 ns=sk->state;
1696 break;
1697 caseTCP_CLOSE:
1698 caseTCP_LISTEN:
1699 break;
1700 caseTCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and1701 wait only for the ACK */1702 ns=TCP_LAST_ACK;
1703 send_fin=1;
1704 }1705
1706 tcp_set_state(sk,ns);
1707
1708 /*1709 * This is a (useful) BSD violating of the RFC. There is a1710 * problem with TCP as specified in that the other end could1711 * keep a socket open forever with no application left this end.1712 * We use a 3 minute timeout (about the same as BSD) then kill1713 * our end. If they send after that then tough - BUT: long enough1714 * that we won't make the old 4*rto = almost no time - whoops1715 * reset mistake.1716 */1717 if(dead && ns==TCP_FIN_WAIT2)
1718 {1719 inttimer_active=del_timer(&sk->timer);
1720 if(timer_active)
1721 add_timer(&sk->timer);
1722 else1723 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
1724 }1725
1726 returnsend_fin;
1727 }1728
1729 /*1730 * Shutdown the sending side of a connection. Much like close except1731 * that we don't receive shut down or set sk->dead=1.1732 */1733
1734 voidtcp_shutdown(structsock *sk, inthow)
/* */1735 {1736 /*1737 * We need to grab some memory, and put together a FIN,1738 * and then put it into the queue to be sent.1739 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.1740 */1741
1742 if (!(how & SEND_SHUTDOWN))
1743 return;
1744
1745 /*1746 * If we've already sent a FIN, or it's a closed state1747 */1748
1749 if (sk->state == TCP_FIN_WAIT1 ||
1750 sk->state == TCP_FIN_WAIT2 ||
1751 sk->state == TCP_CLOSING ||
1752 sk->state == TCP_LAST_ACK ||
1753 sk->state == TCP_TIME_WAIT ||
1754 sk->state == TCP_CLOSE ||
1755 sk->state == TCP_LISTEN1756 )
1757 {1758 return;
1759 }1760 lock_sock(sk);
1761
1762 /*1763 * flag that the sender has shutdown1764 */1765
1766 sk->shutdown |= SEND_SHUTDOWN;
1767
1768 /*1769 * Clear out any half completed packets. 1770 */1771
1772 if (sk->partial)
1773 tcp_send_partial(sk);
1774
1775 /*1776 * FIN if needed1777 */1778
1779 if (tcp_close_state(sk,0))
1780 tcp_send_fin(sk);
1781
1782 release_sock(sk);
1783 }1784
1785
1786 /*1787 * Return 1 if we still have things to send in our buffers.1788 */1789
1790 staticinlineintclosing(structsock * sk)
/* */1791 {1792 switch (sk->state) {1793 caseTCP_FIN_WAIT1:
1794 caseTCP_CLOSING:
1795 caseTCP_LAST_ACK:
1796 return 1;
1797 }1798 return 0;
1799 }1800
1801
1802 staticvoidtcp_close(structsock *sk, unsignedlongtimeout)
/* */1803 {1804 structsk_buff *skb;
1805
1806 /*1807 * We need to grab some memory, and put together a FIN, 1808 * and then put it into the queue to be sent.1809 */1810
1811 lock_sock(sk);
1812
1813 tcp_cache_zap();
1814 if(sk->state == TCP_LISTEN)
1815 {1816 /* Special case */1817 tcp_set_state(sk, TCP_CLOSE);
1818 tcp_close_pending(sk);
1819 release_sock(sk);
1820 return;
1821 }1822
1823 sk->keepopen = 1;
1824 sk->shutdown = SHUTDOWN_MASK;
1825
1826 if (!sk->dead)
1827 sk->state_change(sk);
1828
1829 /*1830 * We need to flush the recv. buffs. We do this only on the1831 * descriptor close, not protocol-sourced closes, because the1832 * reader process may not have drained the data yet!1833 */1834
1835 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
1836 kfree_skb(skb, FREE_READ);
1837
1838 /*1839 * Get rid off any half-completed packets. 1840 */1841
1842 if (sk->partial)
1843 tcp_send_partial(sk);
1844
1845 /*1846 * Timeout is not the same thing - however the code likes1847 * to send both the same way (sigh).1848 */1849
1850 if (tcp_close_state(sk,1)==1)
1851 {1852 tcp_send_fin(sk);
1853 }1854
1855 if (timeout) {1856 cli();
1857 release_sock(sk);
1858 current->timeout = timeout;
1859 while(closing(sk) && current->timeout)
1860 {1861 interruptible_sleep_on(sk->sleep);
1862 if (current->signal & ~current->blocked)
1863 {1864 break;
1865 }1866 }1867 current->timeout=0;
1868 lock_sock(sk);
1869 sti();
1870 }1871
1872 /*1873 * This will destroy it. The timers will take care of actually1874 * free'ing up the memory.1875 */1876 sk->dead = 1;
1877 tcp_cache_zap(); /* Kill the cache again. */1878 release_sock(sk);
1879 }1880
1881
1882 /*1883 * This will accept the next outstanding connection. 1884 */1885
1886 staticstructsock *tcp_accept(structsock *sk, intflags)
/* */1887 {1888 structsock *newsk;
1889 structsk_buff *skb;
1890
1891 /*1892 * We need to make sure that this socket is listening,1893 * and that it has something pending.1894 */1895
1896 if (sk->state != TCP_LISTEN)
1897 {1898 sk->err = EINVAL;
1899 return(NULL);
1900 }1901
1902 /* Avoid the race. */1903 cli();
1904 lock_sock(sk);
1905
1906 while((skb = tcp_dequeue_established(sk)) == NULL)
1907 {1908 if (flags & O_NONBLOCK)
1909 {1910 sti();
1911 release_sock(sk);
1912 sk->err = EAGAIN;
1913 return(NULL);
1914 }1915
1916 release_sock(sk);
1917 interruptible_sleep_on(sk->sleep);
1918 if (current->signal & ~current->blocked)
1919 {1920 sti();
1921 sk->err = ERESTARTSYS;
1922 return(NULL);
1923 }1924 lock_sock(sk);
1925 }1926 sti();
1927
1928 /*1929 * Now all we need to do is return skb->sk. 1930 */1931
1932 newsk = skb->sk;
1933
1934 kfree_skb(skb, FREE_READ);
1935 sk->ack_backlog--;
1936 release_sock(sk);
1937 return(newsk);
1938 }1939
1940 /*1941 * This will initiate an outgoing connection. 1942 */1943
1944 staticinttcp_connect(structsock *sk, structsockaddr_in *usin, intaddr_len)
/* */1945 {1946 structsk_buff *buff;
1947 structdevice *dev=NULL;
1948 unsignedchar *ptr;
1949 inttmp;
1950 intatype;
1951 structtcphdr *t1;
1952 structrtable *rt;
1953
1954 if (sk->state != TCP_CLOSE)
1955 return(-EISCONN);
1956
1957 /*1958 * Don't allow a double connect.1959 */1960
1961 if(sk->daddr)
1962 return -EINVAL;
1963
1964 if (addr_len < 8)
1965 return(-EINVAL);
1966
1967 if (usin->sin_family && usin->sin_family != AF_INET)
1968 return(-EAFNOSUPPORT);
1969
1970 /*1971 * connect() to INADDR_ANY means loopback (BSD'ism).1972 */1973
1974 if(usin->sin_addr.s_addr==INADDR_ANY)
1975 usin->sin_addr.s_addr=ip_my_addr();
1976
1977 /*1978 * Don't want a TCP connection going to a broadcast address 1979 */1980
1981 if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
1982 return -ENETUNREACH;
1983
1984 lock_sock(sk);
1985 sk->daddr = usin->sin_addr.s_addr;
1986 sk->write_seq = tcp_init_seq();
1987 sk->window_seq = sk->write_seq;
1988 sk->rcv_ack_seq = sk->write_seq -1;
1989 sk->err = 0;
1990 sk->dummy_th.dest = usin->sin_port;
1991 release_sock(sk);
1992
1993 buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
1994 if (buff == NULL)
1995 {1996 return(-ENOMEM);
1997 }1998 lock_sock(sk);
1999 buff->sk = sk;
2000 buff->free = 0;
2001 buff->localroute = sk->localroute;
2002
2003
2004 /*2005 * Put in the IP header and routing stuff.2006 */2007
2008 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
2009 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2010 if (tmp < 0)
2011 {2012 sock_wfree(sk, buff);
2013 release_sock(sk);
2014 return(-ENETUNREACH);
2015 }2016 if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
2017 sk->saddr = rt->rt_src;
2018 sk->rcv_saddr = sk->saddr;
2019
2020 t1 = (structtcphdr *) skb_put(buff,sizeof(structtcphdr));
2021
2022 memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
2023 buff->seq = sk->write_seq++;
2024 t1->seq = htonl(buff->seq);
2025 sk->sent_seq = sk->write_seq;
2026 buff->end_seq = sk->write_seq;
2027 t1->ack = 0;
2028 t1->window = 2;
2029 t1->syn = 1;
2030 t1->doff = 6;
2031 /* use 512 or whatever user asked for */2032
2033 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
2034 sk->window_clamp=rt->rt_window;
2035 else2036 sk->window_clamp=0;
2037
2038 if (sk->user_mss)
2039 sk->mtu = sk->user_mss;
2040 elseif (rt)
2041 sk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
2042 else2043 sk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
2044
2045 /*2046 * but not bigger than device MTU 2047 */2048
2049 if(sk->mtu <32)
2050 sk->mtu = 32; /* Sanity limit */2051
2052 sk->mtu = min(sk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
2053
2054 #ifdefCONFIG_SKIP2055
2056 /*2057 * SKIP devices set their MTU to 65535. This is so they can take packets2058 * unfragmented to security process then fragment. They could lie to the2059 * TCP layer about a suitable MTU, but its easier to let skip sort it out2060 * simply because the final package we want unfragmented is going to be2061 *2062 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]2063 */2064
2065 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */2066 sk->mtu=skip_pick_mtu(sk->mtu,dev);
2067 #endif2068
2069 /*2070 * Put in the TCP options to say MTU. 2071 */2072
2073 ptr = skb_put(buff,4);
2074 ptr[0] = 2;
2075 ptr[1] = 4;
2076 ptr[2] = (sk->mtu) >> 8;
2077 ptr[3] = (sk->mtu) & 0xff;
2078 buff->csum = csum_partial(ptr, 4, 0);
2079 tcp_send_check(t1, sk->saddr, sk->daddr,
2080 sizeof(structtcphdr) + 4, buff);
2081
2082 /*2083 * This must go first otherwise a really quick response will get reset. 2084 */2085
2086 tcp_cache_zap();
2087 tcp_set_state(sk,TCP_SYN_SENT);
2088 if(rt&&rt->rt_flags&RTF_IRTT)
2089 sk->rto = rt->rt_irtt;
2090 else2091 sk->rto = TCP_TIMEOUT_INIT;
2092 sk->retransmit_timer.function=&tcp_retransmit_timer;
2093 sk->retransmit_timer.data = (unsignedlong)sk;
2094 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */2095 sk->retransmits = 0; /* Now works the right way instead of a hacked 2096 initial setting */2097
2098 sk->prot->queue_xmit(sk, dev, buff, 0);
2099 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2100 tcp_statistics.TcpActiveOpens++;
2101 tcp_statistics.TcpOutSegs++;
2102
2103 release_sock(sk);
2104 return(0);
2105 }2106
2107 /*2108 * Socket option code for TCP. 2109 */2110
2111 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */2112 {2113 intval,err;
2114
2115 if(level!=SOL_TCP)
2116 returnip_setsockopt(sk,level,optname,optval,optlen);
2117
2118 if (optval == NULL)
2119 return(-EINVAL);
2120
2121 err=verify_area(VERIFY_READ, optval, sizeof(int));
2122 if(err)
2123 returnerr;
2124
2125 val = get_user((int *)optval);
2126
2127 switch(optname)
2128 {2129 caseTCP_MAXSEG:
2130 /*2131 * values greater than interface MTU won't take effect. however at2132 * the point when this call is done we typically don't yet know2133 * which interface is going to be used2134 */2135 if(val<1||val>MAX_WINDOW)
2136 return -EINVAL;
2137 sk->user_mss=val;
2138 return 0;
2139 caseTCP_NODELAY:
2140 sk->nonagle=(val==0)?0:1;
2141 return 0;
2142 default:
2143 return(-ENOPROTOOPT);
2144 }2145 }2146
2147 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */2148 {2149 intval,err;
2150
2151 if(level!=SOL_TCP)
2152 returnip_getsockopt(sk,level,optname,optval,optlen);
2153
2154 switch(optname)
2155 {2156 caseTCP_MAXSEG:
2157 val=sk->user_mss;
2158 break;
2159 caseTCP_NODELAY:
2160 val=sk->nonagle;
2161 break;
2162 default:
2163 return(-ENOPROTOOPT);
2164 }2165 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
2166 if(err)
2167 returnerr;
2168 put_user(sizeof(int),(int *) optlen);
2169
2170 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
2171 if(err)
2172 returnerr;
2173 put_user(val,(int *)optval);
2174
2175 return(0);
2176 }2177
2178
2179 structprototcp_prot = {2180 tcp_close,
2181 ip_build_header,
2182 tcp_connect,
2183 tcp_accept,
2184 ip_queue_xmit,
2185 tcp_retransmit,
2186 tcp_write_wakeup,
2187 tcp_read_wakeup,
2188 tcp_rcv,
2189 tcp_select,
2190 tcp_ioctl,
2191 NULL,
2192 tcp_shutdown,
2193 tcp_setsockopt,
2194 tcp_getsockopt,
2195 tcp_sendmsg,
2196 tcp_recvmsg,
2197 NULL, /* No special bind() */2198 128,
2199 0,
2200 "TCP",
2201 0, 0,
2202 {NULL,}2203 };