1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while 26 * sk->inuse=1 and was trying to connect 27 * (tcp_err()). 28 * Alan Cox : All icmp error handling was broken 29 * pointers passed where wrong and the 30 * socket was looked up backwards. Nobody 31 * tested any icmp error code obviously. 32 * Alan Cox : tcp_err() now handled properly. It 33 * wakes people on errors. select 34 * behaves and the icmp error race 35 * has gone by moving it into sock.c 36 * Alan Cox : tcp_send_reset() fixed to work for 37 * everything not just packets for 38 * unknown sockets. 39 * Alan Cox : tcp option processing. 40 * Alan Cox : Reset tweaked (still not 100%) [Had 41 * syn rule wrong] 42 * Herp Rosmanith : More reset fixes 43 * Alan Cox : No longer acks invalid rst frames. 44 * Acking any kind of RST is right out. 45 * Alan Cox : Sets an ignore me flag on an rst 46 * receive otherwise odd bits of prattle 47 * escape still 48 * Alan Cox : Fixed another acking RST frame bug. 49 * Should stop LAN workplace lockups. 
50 * Alan Cox : Some tidyups using the new skb list 51 * facilities 52 * Alan Cox : sk->keepopen now seems to work 53 * Alan Cox : Pulls options out correctly on accepts 54 * Alan Cox : Fixed assorted sk->rqueue->next errors 55 * Alan Cox : PSH doesn't end a TCP read. Switched a 56 * bit to skb ops. 57 * Alan Cox : Tidied tcp_data to avoid a potential 58 * nasty. 59 * Alan Cox : Added some better commenting, as the 60 * tcp is hard to follow 61 * Alan Cox : Removed incorrect check for 20 * psh 62 * Michael O'Reilly : ack < copied bug fix. 63 * Johannes Stille : Misc tcp fixes (not all in yet). 64 * Alan Cox : FIN with no memory -> CRASH 65 * Alan Cox : Added socket option proto entries. 66 * Also added awareness of them to accept. 67 * Alan Cox : Added TCP options (SOL_TCP) 68 * Alan Cox : Switched wakeup calls to callbacks, 69 * so the kernel can layer network 70 * sockets. 71 * Alan Cox : Use ip_tos/ip_ttl settings. 72 * Alan Cox : Handle FIN (more) properly (we hope). 73 * Alan Cox : RST frames sent on unsynchronised 74 * state ack error. 75 * Alan Cox : Put in missing check for SYN bit. 76 * Alan Cox : Added tcp_select_window() aka NET2E 77 * window non shrink trick. 78 * Alan Cox : Added a couple of small NET2E timer 79 * fixes 80 * Charles Hedrick : TCP fixes 81 * Toomas Tamm : TCP window fixes 82 * Alan Cox : Small URG fix to rlogin ^C ack fight 83 * Charles Hedrick : Rewrote most of it to actually work 84 * Linus : Rewrote tcp_read() and URG handling 85 * completely 86 * Gerhard Koerting: Fixed some missing timer handling 87 * Matthew Dillon : Reworked TCP machine states as per RFC 88 * Gerhard Koerting: PC/TCP workarounds 89 * Adam Caldwell : Assorted timer/timing errors 90 * Matthew Dillon : Fixed another RST bug 91 * Alan Cox : Move to kernel side addressing changes. 92 * Alan Cox : Beginning work on TCP fastpathing 93 * (not yet usable) 94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
95 * Alan Cox : TCP fast path debugging 96 * Alan Cox : Window clamping 97 * Michael Riepe : Bug in tcp_check() 98 * Matt Dillon : More TCP improvements and RST bug fixes 99 * Matt Dillon : Yet more small nasties remove from the 100 * TCP code (Be very nice to this man if 101 * tcp finally works 100%) 8) 102 * Alan Cox : BSD accept semantics. 103 * Alan Cox : Reset on closedown bug. 104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 105 * Michael Pall : Handle select() after URG properly in 106 * all cases. 107 * Michael Pall : Undo the last fix in tcp_read_urg() 108 * (multi URG PUSH broke rlogin). 109 * Michael Pall : Fix the multi URG PUSH problem in 110 * tcp_readable(), select() after URG 111 * works now. 112 * Michael Pall : recv(...,MSG_OOB) never blocks in the 113 * BSD api. 114 * Alan Cox : Changed the semantics of sk->socket to 115 * fix a race and a signal problem with 116 * accept() and async I/O. 117 * Alan Cox : Relaxed the rules on tcp_sendto(). 118 * Yury Shevchuk : Really fixed accept() blocking problem. 119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 120 * clients/servers which listen in on 121 * fixed ports. 122 * Alan Cox : Cleaned the above up and shrank it to 123 * a sensible code size. 124 * Alan Cox : Self connect lockup fix. 125 * Alan Cox : No connect to multicast. 126 * Ross Biro : Close unaccepted children on master 127 * socket close. 128 * Alan Cox : Reset tracing code. 129 * Alan Cox : Spurious resets on shutdown. 130 * Alan Cox : Giant 15 minute/60 second timer error 131 * Alan Cox : Small whoops in selecting before an 132 * accept. 133 * Alan Cox : Kept the state trace facility since 134 * it's handy for debugging. 135 * Alan Cox : More reset handler fixes. 
136 * Alan Cox : Started rewriting the code based on 137 * the RFC's for other useful protocol 138 * references see: Comer, KA9Q NOS, and 139 * for a reference on the difference 140 * between specifications and how BSD 141 * works see the 4.4lite source. 142 * A.N.Kuznetsov : Don't time wait on completion of tidy 143 * close. 144 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 145 * Linus Torvalds : Fixed BSD port reuse to work first syn 146 * Alan Cox : Reimplemented timers as per the RFC 147 * and using multiple timers for sanity. 148 * Alan Cox : Small bug fixes, and a lot of new 149 * comments. 150 * Alan Cox : Fixed dual reader crash by locking 151 * the buffers (much like datagram.c) 152 * Alan Cox : Fixed stuck sockets in probe. A probe 153 * now gets fed up of retrying without 154 * (even a no space) answer. 155 * Alan Cox : Extracted closing code better 156 * Alan Cox : Fixed the closing state machine to 157 * resemble the RFC. 158 * Alan Cox : More 'per spec' fixes. 159 * Jorge Cwik : Even faster checksumming. 160 * Alan Cox : tcp_data() doesn't ack illegal PSH 161 * only frames. At least one pc tcp stack 162 * generates them. 163 * Alan Cox : Cache last socket. 164 * Alan Cox : Per route irtt. 165 * Matt Day : Select() match BSD precisely on error 166 * Alan Cox : New buffers 167 * Marc Tamsky : Various sk->prot->retransmits and 168 * sk->retransmits misupdating fixed. 169 * Fixed tcp_write_timeout: stuck close, 170 * and TCP syn retries gets used now. 171 * Mark Yarvis : In tcp_read_wakeup(), don't send an 172 * ack if stat is TCP_CLOSED. 173 * Alan Cox : Look up device on a retransmit - routes may 174 * change. Doesn't yet cope with MSS shrink right 175 * but it's a start! 176 * Marc Tamsky : Closing in closing fixes. 177 * Mike Shaver : RFC1122 verifications. 178 * Alan Cox : rcv_saddr errors. 179 * Alan Cox : Block double connect(). 180 * Alan Cox : Small hooks for enSKIP. 181 * Alexey Kuznetsov: Path MTU discovery. 
182 * Alan Cox : Support soft errors. 183 * Alan Cox : Fix MTU discovery pathological case 184 * when the remote claims no mtu! 185 * Marc Tamsky : TCP_CLOSE fix. 186 * Colin (G3TNE) : Send a reset on syn ack replies in 187 * window but wrong (fixes NT lpd problems) 188 * Pedro Roque : Better TCP window handling, delayed ack. 189 * Joerg Reuter : No modification of locked buffers in 190 * tcp_do_retransmit() 191 * Eric Schenk : Changed receiver side silly window 192 * avoidance algorithm to BSD style 193 * algorithm. This doubles throughput 194 * against machines running Solaris, 195 * and seems to result in general 196 * improvement. 197 * Eric Schenk : Changed receiver side silly window 198 * avoidance algorithm to BSD style 199 * algorithm. This doubles throughput 200 * against machines running Solaris, 201 * and seems to result in general 202 * improvement. 203 * 204 * To Fix: 205 * Fast path the code. Two things here - fix the window calculation 206 * so it doesn't iterate over the queue, also spot packets with no funny 207 * options arriving in order and process directly. 208 * 209 * Rewrite output state machine to use a single queue. 210 * Speed up input assembly algorithm. 211 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 212 * could do with it working on IPv4 213 * User settable/learned rtt/max window/mtu 214 * 215 * Change the fundamental structure to a single send queue maintained 216 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 217 * active routes too]). Cut the queue off in tcp_retransmit/ 218 * tcp_transmit. 219 * Change the receive queue to assemble as it goes. This lets us 220 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 221 * tcp_data/tcp_read as well as the window shrink crud. 222 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 223 * tcp_queue_skb seem obvious routines to extract. 
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Description of States:
 *
 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 *
 *	TCP_SYN_RECV		received a connection request, sent ack,
 *				waiting for final ack in three-way handshake.
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *				to shutdown
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING.  Required because the other end
 *				may not have gotten our last ACK causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shut down after remote has
 *				shutdown.  There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */

265 /* 266 * RFC1122 status: 267 * NOTE: I'm not going to be doing comments in the code for this one except 268 * for violations and the like. tcp.c is just too big... If I say something 269 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out 270 * with Alan. -- MS 950903 271 * 272 * Use of PSH (4.2.2.2) 273 * MAY aggregate data sent without the PSH flag. (does) 274 * MAY queue data received without the PSH flag. (does) 275 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't) 276 * MAY implement PSH on send calls. (doesn't, thus:) 277 * MUST NOT buffer data indefinitely (doesn't [1 second]) 278 * MUST set PSH on last segment (does) 279 * MAY pass received PSH to application layer (doesn't) 280 * SHOULD send maximum-sized segment whenever possible. (almost always does) 281 * 282 * Window Size (4.2.2.3, 4.2.2.16) 283 * MUST treat window size as an unsigned number (does) 284 * SHOULD treat window size as a 32-bit number (does not) 285 * MUST NOT shrink window once it is offered (does not normally) 286 * 287 * Urgent Pointer (4.2.2.4) 288 * **MUST point urgent pointer to last byte of urgent data (not right 289 * after). (doesn't, to be like BSD) 290 * MUST inform application layer asynchronously of incoming urgent 291 * data. (does) 292 * MUST provide application with means of determining the amount of 293 * urgent data pending. (does) 294 * **MUST support urgent data sequence of arbitrary length. (doesn't, but 295 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) 296 * [Follows BSD 1 byte of urgent data] 297 * 298 * TCP Options (4.2.2.5) 299 * MUST be able to receive TCP options in any segment. (does) 300 * MUST ignore unsupported options (does) 301 * 302 * Maximum Segment Size Option (4.2.2.6) 303 * MUST implement both sending and receiving MSS. (does) 304 * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send 305 * it always). 
(does, even when MSS == 536, which is legal) 306 * MUST assume MSS == 536 if no MSS received at connection setup (does) 307 * MUST calculate "effective send MSS" correctly: 308 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) 309 * (does - but allows operator override) 310 * 311 * TCP Checksum (4.2.2.7) 312 * MUST generate and check TCP checksum. (does) 313 * 314 * Initial Sequence Number Selection (4.2.2.8) 315 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's 316 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is 317 * necessary for 10Mbps networks - and harder than BSD to spoof!) 318 * 319 * Simultaneous Open Attempts (4.2.2.10) 320 * MUST support simultaneous open attempts (does) 321 * 322 * Recovery from Old Duplicate SYN (4.2.2.11) 323 * MUST keep track of active vs. passive open (does) 324 * 325 * RST segment (4.2.2.12) 326 * SHOULD allow an RST segment to contain data (does, but doesn't do 327 * anything with it, which is standard) 328 * 329 * Closing a Connection (4.2.2.13) 330 * MUST inform application of whether connection was closed by RST or 331 * normal close. (does) 332 * MAY allow "half-duplex" close (treat connection as closed for the 333 * local app, even before handshake is done). (does) 334 * MUST linger in TIME_WAIT for 2 * MSL (does) 335 * 336 * Retransmission Timeout (4.2.2.15) 337 * MUST implement Jacobson's slow start and congestion avoidance 338 * stuff. (does) 339 * 340 * Probing Zero Windows (4.2.2.17) 341 * MUST support probing of zero windows. (does) 342 * MAY keep offered window closed indefinitely. (does) 343 * MUST allow remote window to stay closed indefinitely. (does) 344 * 345 * Passive Open Calls (4.2.2.18) 346 * MUST NOT let new passive open affect other connections. (doesn't) 347 * MUST support passive opens (LISTENs) concurrently. (does) 348 * 349 * Time to Live (4.2.2.19) 350 * MUST make TCP TTL configurable. 
(does - IP_TTL option) 351 * 352 * Event Processing (4.2.2.20) 353 * SHOULD queue out-of-order segments. (does) 354 * MUST aggregate ACK segments whenever possible. (does but badly) 355 * 356 * Retransmission Timeout Calculation (4.2.3.1) 357 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO 358 * calculation. (does, or at least explains them in the comments 8*b) 359 * SHOULD initialize RTO to 0 and RTT to 3. (does) 360 * 361 * When to Send an ACK Segment (4.2.3.2) 362 * SHOULD implement delayed ACK. (does) 363 * MUST keep ACK delay < 0.5 sec. (does) 364 * 365 * When to Send a Window Update (4.2.3.3) 366 * MUST implement receiver-side SWS. (does) 367 * 368 * When to Send Data (4.2.3.4) 369 * MUST implement sender-side SWS. (does) 370 * SHOULD implement Nagle algorithm. (does) 371 * 372 * TCP Connection Failures (4.2.3.5) 373 * MUST handle excessive retransmissions "properly" (see the RFC). (does) 374 * SHOULD inform application layer of soft errors. (does) 375 * 376 * TCP Keep-Alives (4.2.3.6) 377 * MAY provide keep-alives. (does) 378 * MUST make keep-alives configurable on a per-connection basis. (does) 379 * MUST default to no keep-alives. (does) 380 * **MUST make keep-alive interval configurable. (doesn't) 381 * **MUST make default keep-alive interval > 2 hours. (doesn't) 382 * MUST NOT interpret failure to ACK keep-alive packet as dead 383 * connection. (doesn't) 384 * SHOULD send keep-alive with no data. (does) 385 * 386 * TCP Multihoming (4.2.3.7) 387 * MUST get source address from IP layer before sending first 388 * SYN. (does) 389 * MUST use same local address for all segments of a connection. (does) 390 * 391 * IP Options (4.2.3.8) 392 * MUST ignore unsupported IP options. (does) 393 * MAY support Time Stamp and Record Route. (does) 394 * MUST allow application to specify a source route. (does) 395 * MUST allow received Source Route option to set route for all future 396 * segments on this connection. 
(does not (security issues)) 397 * 398 * ICMP messages (4.2.3.9) 399 * MUST act on ICMP errors. (does) 400 * MUST slow transmission upon receipt of a Source Quench. (does) 401 * MUST NOT abort connection upon receipt of soft Destination 402 * Unreachables (0, 1, 5), Time Exceededs and Parameter 403 * Problems. (doesn't) 404 * SHOULD report soft Destination Unreachables etc. to the 405 * application. (does) 406 * SHOULD abort connection upon receipt of hard Destination Unreachable 407 * messages (2, 3, 4). (does) 408 * 409 * Remote Address Validation (4.2.3.10) 410 * MUST reject as an error OPEN for invalid remote IP address. (does) 411 * MUST ignore SYN with invalid source address. (does) 412 * MUST silently discard incoming SYN for broadcast/multicast 413 * address. (does) 414 * 415 * Asynchronous Reports (4.2.4.1) 416 * MUST provide mechanism for reporting soft errors to application 417 * layer. (does) 418 * 419 * Type of Service (4.2.4.2) 420 * MUST allow application layer to set Type of Service. (does IP_TOS) 421 * 422 * (Whew. -- MS 950903) 423 **/ 424
425 #include <linux/config.h>
426 #include <linux/types.h>
427 #include <linux/fcntl.h>
428
429 #include <net/icmp.h>
430 #include <net/tcp.h>
431
432 #include <asm/segment.h>
433
434 unsignedlongseq_offset;
435 structtcp_mibtcp_statistics;
436
437 staticvoidtcp_close(structsock *sk, unsignedlongtimeout);
438
439 /* 440 * Find someone to 'accept'. Must be called with 441 * the socket locked or with interrupts disabled 442 */ 443
444 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 445 { 446 structsk_buff *p=skb_peek(&s->receive_queue);
447 if(p==NULL)
448 returnNULL;
449 do 450 { 451 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
452 returnp;
453 p=p->next;
454 } 455 while(p!=(structsk_buff *)&s->receive_queue);
456 returnNULL;
457 } 458
459 /* 460 * This routine closes sockets which have been at least partially 461 * opened, but not yet accepted. Currently it is only called by 462 * tcp_close, and timeout mirrors the value there. 463 */ 464
465 staticvoidtcp_close_pending (structsock *sk)
/* */ 466 { 467 structsk_buff *skb;
468
469 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
470 { 471 tcp_close(skb->sk, 0);
472 kfree_skb(skb, FREE_READ);
473 } 474 return;
475 } 476
477 /* 478 * Enter the time wait state. 479 */ 480
481 voidtcp_time_wait(structsock *sk)
/* */ 482 { 483 tcp_set_state(sk,TCP_TIME_WAIT);
484 sk->shutdown = SHUTDOWN_MASK;
485 if (!sk->dead)
486 sk->state_change(sk);
487 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
488 } 489
490
491 /* 492 * This routine is called by the ICMP module when it gets some 493 * sort of error condition. If err < 0 then the socket should 494 * be closed and the error returned to the user. If err > 0 495 * it's just the icmp type << 8 | icmp code. After adjustment 496 * header points to the first 8 bytes of the tcp header. We need 497 * to find the appropriate port. 498 */ 499
500 voidtcp_err(inttype, intcode, unsignedchar *header, __u32daddr,
/* */ 501 __u32saddr, structinet_protocol *protocol)
502 { 503 structtcphdr *th = (structtcphdr *)header;
504 structsock *sk;
505
506 /* 507 * This one is _WRONG_. FIXME urgently. 508 */ 509 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 510 structiphdr *iph=(structiphdr *)(header-sizeof(structiphdr));
511 #endif 512 th =(structtcphdr *)header;
513 sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
514
515 if (sk == NULL)
516 return;
517
518 if (type == ICMP_SOURCE_QUENCH)
519 { 520 /* 521 * FIXME: 522 * For now we will just trigger a linear backoff. 523 * The slow start code should cause a real backoff here. 524 */ 525 if (sk->cong_window > 4)
526 sk->cong_window--;
527 return;
528 } 529
530 if (type == ICMP_PARAMETERPROB)
531 { 532 sk->err=EPROTO;
533 sk->error_report(sk);
534 } 535
536 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 537 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
538 { 539 structrtable * rt;
540 /* 541 * Ugly trick to pass MTU to protocol layer. 542 * Really we should add argument "info" to error handler. 543 */ 544 unsignedshortnew_mtu = ntohs(iph->id);
545
546 if ((rt = sk->ip_route_cache) != NULL)
547 if (rt->rt_mtu > new_mtu)
548 rt->rt_mtu = new_mtu;
549
550 if (sk->mtu > new_mtu - sizeof(structiphdr) - sizeof(structtcphdr)
551 && new_mtu > sizeof(structiphdr)+sizeof(structtcphdr))
552 sk->mtu = new_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
553
554 return;
555 } 556 #endif 557
558 /* 559 * If we've already connected we will keep trying 560 * until we time out, or the user gives up. 561 */ 562
563 if (code < 13)
564 { 565 if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
566 { 567 sk->err = icmp_err_convert[code].errno;
568 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
569 { 570 tcp_statistics.TcpAttemptFails++;
571 tcp_set_state(sk,TCP_CLOSE);
572 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ 573 } 574 } 575 else/* Only an error on timeout */ 576 sk->err_soft = icmp_err_convert[code].errno;
577 } 578 } 579
580
581 /* 582 * Walk down the receive queue counting readable data until we hit the end or we find a gap 583 * in the received data queue (ie a frame missing that needs sending to us). Not 584 * sorting using two queues as data arrives makes life so much harder. 585 */ 586
587 staticinttcp_readable(structsock *sk)
/* */ 588 { 589 unsignedlongcounted;
590 unsignedlongamount;
591 structsk_buff *skb;
592 intsum;
593 unsignedlongflags;
594
595 if(sk && sk->debug)
596 printk("tcp_readable: %p - ",sk);
597
598 save_flags(flags);
599 cli();
600 if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
601 { 602 restore_flags(flags);
603 if(sk && sk->debug)
604 printk("empty\n");
605 return(0);
606 } 607
608 counted = sk->copied_seq; /* Where we are at the moment */ 609 amount = 0;
610
611 /* 612 * Do until a push or until we are out of data. 613 */ 614
615 do 616 { 617 if (before(counted, skb->seq)) /* Found a hole so stops here */ 618 break;
619 sum = skb->len - (counted - skb->seq); /* Length - header but start from where we are up to (avoid overlaps) */ 620 if (skb->h.th->syn)
621 sum++;
622 if (sum > 0)
623 {/* Add it up, move on */ 624 amount += sum;
625 if (skb->h.th->syn)
626 amount--;
627 counted += sum;
628 } 629 /* 630 * Don't count urg data ... but do it in the right place! 631 * Consider: "old_data (ptr is here) URG PUSH data" 632 * The old code would stop at the first push because 633 * it counted the urg (amount==1) and then does amount-- 634 * *after* the loop. This means tcp_readable() always 635 * returned zero if any URG PUSH was in the queue, even 636 * though there was normal data available. If we subtract 637 * the urg data right here, we even get it to work for more 638 * than one URG PUSH skb without normal data. 639 * This means that select() finally works now with urg data 640 * in the queue. Note that rlogin was never affected 641 * because it doesn't use select(); it uses two processes 642 * and a blocking read(). And the queue scan in tcp_read() 643 * was correct. Mike <pall@rz.uni-karlsruhe.de> 644 */ 645 if (skb->h.th->urg)
646 amount--; /* don't count urg data */ 647 if (amount && skb->h.th->psh) break;
648 skb = skb->next;
649 } 650 while(skb != (structsk_buff *)&sk->receive_queue);
651
652 restore_flags(flags);
653 if(sk->debug)
654 printk("got %lu bytes.\n",amount);
655 return(amount);
656 } 657
658 /* 659 * LISTEN is a special case for select.. 660 */ 661 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */ 662 { 663 if (sel_type == SEL_IN) { 664 structsk_buff * skb;
665
666 lock_sock(sk);
667 skb = tcp_find_established(sk);
668 release_sock(sk);
669 if (skb)
670 return 1;
671 select_wait(sk->sleep,wait);
672 return 0;
673 } 674 return 0;
675 } 676
677
678 /* 679 * Wait for a TCP event. 680 * 681 * Note that we don't need to lock the socket, as the upper select layers 682 * take care of normal races (between the test and the event) and we don't 683 * go look at any of the socket buffers directly. 684 */ 685 staticinttcp_select(structsock *sk, intsel_type, select_table *wait)
/* */ 686 { 687 if (sk->state == TCP_LISTEN)
688 returntcp_listen_select(sk, sel_type, wait);
689
690 switch(sel_type) { 691 caseSEL_IN:
692 if (sk->err)
693 return 1;
694 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
695 break;
696
697 if (sk->shutdown & RCV_SHUTDOWN)
698 return 1;
699
700 if (sk->acked_seq == sk->copied_seq)
701 break;
702
703 if (sk->urg_seq != sk->copied_seq ||
704 sk->acked_seq != sk->copied_seq+1 ||
705 sk->urginline || !sk->urg_data)
706 return 1;
707 break;
708
709 caseSEL_OUT:
710 if (sk->err)
711 return 1;
712 if (sk->shutdown & SEND_SHUTDOWN)
713 return 0;
714 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
715 break;
716 /* 717 * This is now right thanks to a small fix 718 * by Matt Dillon. 719 */ 720
721 if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
722 break;
723 return 1;
724
725 caseSEL_EX:
726 if (sk->urg_data)
727 return 1;
728 break;
729 } 730 select_wait(sk->sleep, wait);
731 return 0;
732 } 733
734 inttcp_ioctl(structsock *sk, intcmd, unsignedlongarg)
/* */ 735 { 736 interr;
737 switch(cmd)
738 { 739
740 caseTIOCINQ:
741 #ifdef FIXME /* FIXME: */ 742 caseFIONREAD:
743 #endif 744 { 745 unsignedlongamount;
746
747 if (sk->state == TCP_LISTEN)
748 return(-EINVAL);
749
750 lock_sock(sk);
751 amount = tcp_readable(sk);
752 release_sock(sk);
753 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
754 if(err)
755 returnerr;
756 put_user(amount, (int *)arg);
757 return(0);
758 } 759 caseSIOCATMARK:
760 { 761 intansw = sk->urg_data && sk->urg_seq == sk->copied_seq;
762
763 err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
764 if (err)
765 returnerr;
766 put_user(answ,(int *) arg);
767 return(0);
768 } 769 caseTIOCOUTQ:
770 { 771 unsignedlongamount;
772
773 if (sk->state == TCP_LISTEN) return(-EINVAL);
774 amount = sock_wspace(sk);
775 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
776 if(err)
777 returnerr;
778 put_user(amount, (int *)arg);
779 return(0);
780 } 781 default:
782 return(-EINVAL);
783 } 784 } 785
786
787 /* 788 * This routine computes a TCP checksum. 789 * 790 * Modified January 1995 from a go-faster DOS routine by 791 * Jorge Cwik <jorge@laser.satlink.net> 792 */ 793 #undefDEBUG_TCP_CHECK 794 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */ 795 unsignedlongdaddr, intlen, structsk_buff *skb)
796 { 797 #ifdefDEBUG_TCP_CHECK 798 u16check;
799 #endif 800 th->check = 0;
801 th->check = tcp_check(th, len, saddr, daddr,
802 csum_partial((char *)th,sizeof(*th),skb->csum));
803
804 #ifdefDEBUG_TCP_CHECK 805 check = th->check;
806 th->check = 0;
807 th->check = tcp_check(th, len, saddr, daddr,
808 csum_partial((char *)th,len,0));
809 if (check != th->check) { 810 staticintcount = 0;
811 if (++count < 10) { 812 printk("Checksum %x (%x) from %p\n", th->check, check,
813 (&th)[-1]);
814 printk("TCP=<off:%d a:%d s:%d f:%d>\n", th->doff*4, th->ack, th->syn, th->fin);
815 } 816 } 817 #endif 818 } 819
820
821 /* 822 * This routine builds a generic TCP header. 823 */ 824
825 staticinlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */ 826 { 827 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
828 th->psh = (push == 0) ? 1 : 0;
829 th->seq = htonl(sk->write_seq);
830 th->ack_seq = htonl(sk->acked_seq);
831 th->window = htons(tcp_select_window(sk));
832
833 return(sizeof(*th));
834 } 835
836 /* 837 * Wait for a socket to get into the connected state 838 */ 839 staticvoidwait_for_tcp_connect(structsock * sk)
/* */ 840 { 841 release_sock(sk);
842 cli();
843 if (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT && sk->err == 0)
844 { 845 interruptible_sleep_on(sk->sleep);
846 } 847 sti();
848 lock_sock(sk);
849 } 850
851 /* 852 * Wait for more memory for a socket 853 */ 854 staticvoidwait_for_tcp_memory(structsock * sk)
/* */ 855 { 856 release_sock(sk);
857 cli();
858 if (sk->wmem_alloc*2 > sk->sndbuf &&
859 (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
860 && sk->err == 0)
861 { 862 sk->socket->flags &= ~SO_NOSPACE;
863 interruptible_sleep_on(sk->sleep);
864 } 865 sti();
866 lock_sock(sk);
867 } 868
869
870 /* 871 * This routine copies from a user buffer into a socket, 872 * and starts the transmit system. 873 */ 874
875 staticintdo_tcp_sendmsg(structsock *sk,
/* */ 876 intiovlen, structiovec *iov,
877 intlen, intnonblock, intflags)
878 { 879 intcopied = 0;
880 structdevice *dev = NULL;
881
882 /* 883 * Wait for a connection to finish. 884 */ 885 while (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
886 { 887 if (sk->err)
888 returnsock_error(sk);
889
890 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
891 { 892 if (sk->keepopen)
893 send_sig(SIGPIPE, current, 0);
894 return -EPIPE;
895 } 896
897 if (nonblock)
898 return -EAGAIN;
899
900 if (current->signal & ~current->blocked)
901 return -ERESTARTSYS;
902
903 wait_for_tcp_connect(sk);
904 } 905
906 /* 907 * Ok commence sending 908 */ 909
910 while (--iovlen >= 0)
911 { 912 intseglen=iov->iov_len;
913 unsignedchar * from=iov->iov_base;
914 iov++;
915
916 while(seglen > 0)
917 { 918 intcopy, delay;
919 inttmp;
920 structsk_buff *skb;
921
922 /* 923 * Stop on errors 924 */ 925 if (sk->err)
926 { 927 if (copied)
928 returncopied;
929 returnsock_error(sk);
930 } 931
932 /* 933 * Make sure that we are established. 934 */ 935 if (sk->shutdown & SEND_SHUTDOWN)
936 { 937 if (copied)
938 returncopied;
939 return -EPIPE;
940 } 941
942 /* 943 * The following code can result in copy <= if sk->mss is ever 944 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window). 945 * sk->mtu is constant once SYN processing is finished. I.e. we 946 * had better not get here until we've seen his SYN and at least one 947 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.) 948 * But ESTABLISHED should guarantee that. sk->max_window is by definition 949 * non-decreasing. Note that any ioctl to set user_mss must be done 950 * before the exchange of SYN's. If the initial ack from the other 951 * end has a window of 0, max_window and thus mss will both be 0. 952 */ 953
954 /* 955 * Now we need to check if we have a half built packet. 956 */ 957 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 958 /* 959 * FIXME: I'm almost sure that this fragment is BUG, 960 * but it works... I do not know why 8) --ANK 961 * 962 * Really, we should rebuild all the queues... 963 * It's difficult. Temporary hack is to send all 964 * queued segments with allowed fragmentation. 965 */ 966 { 967 intnew_mss = min(sk->mtu, sk->max_window);
968 if (new_mss < sk->mss)
969 { 970 tcp_send_partial(sk);
971 sk->mss = new_mss;
972 } 973 } 974 #endif 975
976 if ((skb = tcp_dequeue_partial(sk)) != NULL)
977 { 978 inttcp_size;
979
980 tcp_size = skb->tail - (unsignedchar *)(skb->h.th + 1);
981
982 /* Add more stuff to the end of skb->len */ 983 if (!(flags & MSG_OOB))
984 { 985 copy = min(sk->mss - tcp_size, seglen);
986 if (copy <= 0)
987 { 988 printk("TCP: **bug**: \"copy\" <= 0\n");
989 return -EFAULT;
990 } 991 tcp_size += copy;
992 memcpy_fromfs(skb_put(skb,copy), from, copy);
993 skb->csum = csum_partial(skb->tail - tcp_size, tcp_size, 0);
994 from += copy;
995 copied += copy;
996 len -= copy;
997 sk->write_seq += copy;
998 seglen -= copy;
999 }1000 if (tcp_size >= sk->mss || (flags & MSG_OOB) || !sk->packets_out)
1001 tcp_send_skb(sk, skb);
1002 else1003 tcp_enqueue_partial(skb, sk);
1004 continue;
1005 }1006
1007 /*1008 * We also need to worry about the window.1009 * If window < 1/2 the maximum window we've seen from this1010 * host, don't use it. This is sender side1011 * silly window prevention, as specified in RFC1122.1012 * (Note that this is different than earlier versions of1013 * SWS prevention, e.g. RFC813.). What we actually do is1014 * use the whole MSS. Since the results in the right1015 * edge of the packet being outside the window, it will1016 * be queued for later rather than sent.1017 */1018
1019 copy = sk->window_seq - sk->write_seq;
1020 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1021 copy = sk->mss;
1022 if (copy > seglen)
1023 copy = seglen;
1024 if (copy <= 0)
1025 {1026 printk("TCP: **bug**: copy=%d, sk->mss=%d\n", copy, sk->mss);
1027 return -EFAULT;
1028 }1029
1030 /*1031 * We should really check the window here also.1032 */1033
1034 delay = 0;
1035 tmp = copy + sk->prot->max_header + 15;
1036 if (copy < sk->mss && !(flags & MSG_OOB) && sk->packets_out)
1037 {1038 tmp = tmp - copy + sk->mtu + 128;
1039 delay = 1;
1040 }1041 skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);
1042
1043 /*1044 * If we didn't get any memory, we need to sleep.1045 */1046
1047 if (skb == NULL)
1048 {1049 sk->socket->flags |= SO_NOSPACE;
1050 if (nonblock)
1051 {1052 if (copied)
1053 returncopied;
1054 return -EAGAIN;
1055 }1056
1057 if (current->signal & ~current->blocked)
1058 {1059 if (copied)
1060 returncopied;
1061 return -ERESTARTSYS;
1062 }1063
1064 wait_for_tcp_memory(sk);
1065 continue;
1066 }1067
1068 skb->sk = sk;
1069 skb->free = 0;
1070 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1071
1072 /*1073 * FIXME: we need to optimize this.1074 * Perhaps some hints here would be good.1075 */1076
1077 tmp = sk->prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1078 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1079 if (tmp < 0 )
1080 {1081 sock_wfree(sk, skb);
1082 if (copied)
1083 return(copied);
1084 return(tmp);
1085 }1086 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1087 skb->ip_hdr->frag_off |= htons(IP_DF);
1088 #endif1089 skb->dev = dev;
1090 skb->h.th =(structtcphdr *)skb_put(skb,sizeof(structtcphdr));
1091 tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
1092 if (tmp < 0)
1093 {1094 sock_wfree(sk, skb);
1095 if (copied)
1096 return(copied);
1097 return(tmp);
1098 }1099
1100 if (flags & MSG_OOB)
1101 {1102 skb->h.th->urg = 1;
1103 skb->h.th->urg_ptr = ntohs(copy);
1104 }1105
1106 skb->csum = csum_partial_copy_fromuser(from,
1107 skb_put(skb,copy), copy, 0);
1108
1109 from += copy;
1110 copied += copy;
1111 len -= copy;
1112 seglen -= copy;
1113 skb->free = 0;
1114 sk->write_seq += copy;
1115
1116 if (delay)
1117 {1118 tcp_enqueue_partial(skb, sk);
1119 continue;
1120 }1121 tcp_send_skb(sk, skb);
1122 }1123 }1124 sk->err = 0;
1125
1126 returncopied;
1127 }1128
1129
1130 staticinttcp_sendmsg(structsock *sk, structmsghdr *msg,
/* */1131 intlen, intnonblock, intflags)
1132 {1133 intretval = -EINVAL;
1134
1135 /*1136 * Do sanity checking for sendmsg/sendto/send1137 */1138
1139 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1140 gotoout;
1141 if (msg->msg_name) {1142 structsockaddr_in *addr=(structsockaddr_in *)msg->msg_name;
1143
1144 if (msg->msg_namelen < sizeof(*addr))
1145 gotoout;
1146 if (addr->sin_family && addr->sin_family != AF_INET)
1147 gotoout;
1148 retval = -ENOTCONN;
1149 if(sk->state == TCP_CLOSE)
1150 gotoout;
1151 retval = -EISCONN;
1152 if (addr->sin_port != sk->dummy_th.dest)
1153 gotoout;
1154 if (addr->sin_addr.s_addr != sk->daddr)
1155 gotoout;
1156 }1157
1158 lock_sock(sk);
1159 retval = do_tcp_sendmsg(sk, msg->msg_iovlen, msg->msg_iov, len, nonblock, flags);
1160
1161 /*1162 * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly1163 * interactive fast network servers. It's meant to be on and1164 * it really improves the throughput though not the echo time1165 * on my slow slip link - Alan1166 *1167 * If not nagling we can send on the before case too..1168 */1169
1170 if (sk->partial) {1171 if (!sk->packets_out ||
1172 (sk->nonagle && before(sk->write_seq , sk->window_seq))) {1173 tcp_send_partial(sk);
1174 }1175 }1176
1177 release_sock(sk);
1178
1179 out:
1180 returnretval;
1181 }1182
1183
1184 /*1185 * Send an ack if one is backlogged at this point.1186 *1187 * This is called for delayed acks also.1188 */1189
1190 voidtcp_read_wakeup(structsock *sk)
/* */1191 {1192 if (!sk->ack_backlog)
1193 return;
1194
1195 /*1196 * If we're closed, don't send an ack, or we'll get a RST1197 * from the closed destination.1198 */1199 if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
1200 return;
1201
1202 tcp_send_ack(sk);
1203 }1204
1205
1206 /*1207 * Handle reading urgent data. BSD has very simple semantics for1208 * this, no blocking and very strange errors 8)1209 */1210
1211 staticinttcp_recv_urg(structsock * sk, intnonblock,
/* */1212 structmsghdr *msg, intlen, intflags, int *addr_len)
1213 {1214 /*1215 * No URG data to read1216 */1217 if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
1218 return -EINVAL; /* Yes this is right ! */1219
1220 if (sk->err)
1221 returnsock_error(sk);
1222
1223 if (sk->state == TCP_CLOSE || sk->done)
1224 {1225 if (!sk->done)
1226 {1227 sk->done = 1;
1228 return 0;
1229 }1230 return -ENOTCONN;
1231 }1232
1233 if (sk->shutdown & RCV_SHUTDOWN)
1234 {1235 sk->done = 1;
1236 return 0;
1237 }1238 lock_sock(sk);
1239 if (sk->urg_data & URG_VALID)
1240 {1241 charc = sk->urg_data;
1242 if (!(flags & MSG_PEEK))
1243 sk->urg_data = URG_READ;
1244 memcpy_toiovec(msg->msg_iov, &c, 1);
1245 if(msg->msg_name)
1246 {1247 structsockaddr_in *sin=(structsockaddr_in *)msg->msg_name;
1248 sin->sin_family=AF_INET;
1249 sin->sin_addr.s_addr=sk->daddr;
1250 sin->sin_port=sk->dummy_th.dest;
1251 }1252 if(addr_len)
1253 *addr_len=sizeof(structsockaddr_in);
1254 release_sock(sk);
1255 return 1;
1256 }1257 release_sock(sk);
1258
1259 /*1260 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and1261 * the available implementations agree in this case:1262 * this call should never block, independent of the1263 * blocking state of the socket.1264 * Mike <pall@rz.uni-karlsruhe.de>1265 */1266 return -EAGAIN;
1267 }1268
1269 /*1270 * Release a skb if it is no longer needed. This routine1271 * must be called with interrupts disabled or with the1272 * socket locked so that the sk_buff queue operation is ok.1273 */1274
1275 staticinlinevoidtcp_eat_skb(structsock *sk, structsk_buff * skb)
/* */1276 {1277 skb->sk = sk;
1278 __skb_unlink(skb, &sk->receive_queue);
1279 kfree_skb(skb, FREE_READ);
1280 }1281
1282 /*1283 * FIXME:1284 * This routine frees used buffers.1285 * It should consider sending an ACK to let the1286 * other end know we now have a bigger window.1287 */1288
1289 staticvoidcleanup_rbuf(structsock *sk)
/* */1290 {1291 /*1292 * NOTE! The socket must be locked, so that we don't get1293 * a messed-up receive queue.1294 */1295 while (!skb_queue_empty(&sk->receive_queue)) {1296 structsk_buff *skb = sk->receive_queue.next;
1297 if (!skb->used || skb->users)
1298 break;
1299 tcp_eat_skb(sk, skb);
1300 }1301
1302 /*1303 * Tell the world if we raised the window.1304 */1305 if (tcp_raise_window(sk))
1306 tcp_send_ack(sk);
1307 }1308
1309
1310 /*1311 * This routine copies from a sock struct into the user buffer.1312 */1313
1314 staticinttcp_recvmsg(structsock *sk, structmsghdr *msg,
/* */1315 intlen, intnonblock, intflags, int *addr_len)
1316 {1317 structwait_queuewait = {current, NULL};
1318 intcopied = 0;
1319 u32peek_seq;
1320 volatileu32 *seq; /* So gcc doesn't overoptimise */1321 unsignedlongused;
1322
1323 /*1324 * This error should be checked.1325 */1326
1327 if (sk->state == TCP_LISTEN)
1328 return -ENOTCONN;
1329
1330 /*1331 * Urgent data needs to be handled specially.1332 */1333
1334 if (flags & MSG_OOB)
1335 returntcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
1336
1337 /*1338 * Copying sequence to update. This is volatile to handle1339 * the multi-reader case neatly (memcpy_to/fromfs might be1340 * inline and thus not flush cached variables otherwise).1341 */1342
1343 peek_seq = sk->copied_seq;
1344 seq = &sk->copied_seq;
1345 if (flags & MSG_PEEK)
1346 seq = &peek_seq;
1347
1348 add_wait_queue(sk->sleep, &wait);
1349 lock_sock(sk);
1350 while (len > 0)
1351 {1352 structsk_buff * skb;
1353 u32offset;
1354
1355 /*1356 * Are we at urgent data? Stop if we have read anything.1357 */1358
1359 if (copied && sk->urg_data && sk->urg_seq == *seq)
1360 break;
1361
1362 /*1363 * We need to check signals first, to get correct SIGURG1364 * handling.1365 */1366 if (current->signal & ~current->blocked) {1367 if (copied)
1368 break;
1369 copied = -ERESTARTSYS;
1370 break;
1371 }1372
1373 /*1374 * Next get a buffer.1375 */1376
1377 current->state = TASK_INTERRUPTIBLE;
1378
1379 skb = skb_peek(&sk->receive_queue);
1380 do1381 {1382 if (!skb)
1383 break;
1384 if (before(*seq, skb->seq))
1385 break;
1386 offset = *seq - skb->seq;
1387 if (skb->h.th->syn)
1388 offset--;
1389 if (offset < skb->len)
1390 gotofound_ok_skb;
1391 if (skb->h.th->fin)
1392 gotofound_fin_ok;
1393 if (!(flags & MSG_PEEK))
1394 skb->used = 1;
1395 skb = skb->next;
1396 }1397 while (skb != (structsk_buff *)&sk->receive_queue);
1398
1399 if (copied)
1400 break;
1401
1402 if (sk->err)
1403 {1404 copied = sock_error(sk);
1405 break;
1406 }1407
1408 if (sk->state == TCP_CLOSE)
1409 {1410 if (!sk->done)
1411 {1412 sk->done = 1;
1413 break;
1414 }1415 copied = -ENOTCONN;
1416 break;
1417 }1418
1419 if (sk->shutdown & RCV_SHUTDOWN)
1420 {1421 sk->done = 1;
1422 break;
1423 }1424
1425 if (nonblock)
1426 {1427 copied = -EAGAIN;
1428 break;
1429 }1430
1431 cleanup_rbuf(sk);
1432 release_sock(sk);
1433 sk->socket->flags |= SO_WAITDATA;
1434 schedule();
1435 sk->socket->flags &= ~SO_WAITDATA;
1436 lock_sock(sk);
1437 continue;
1438
1439 found_ok_skb:
1440 /*1441 * Lock the buffer. We can be fairly relaxed as1442 * an interrupt will never steal a buffer we are1443 * using unless I've missed something serious in1444 * tcp_data.1445 */1446
1447 skb->users++;
1448
1449 /*1450 * Ok so how much can we use ?1451 */1452
1453 used = skb->len - offset;
1454 if (len < used)
1455 used = len;
1456 /*1457 * Do we have urgent data here?1458 */1459
1460 if (sk->urg_data)
1461 {1462 u32urg_offset = sk->urg_seq - *seq;
1463 if (urg_offset < used)
1464 {1465 if (!urg_offset)
1466 {1467 if (!sk->urginline)
1468 {1469 ++*seq;
1470 offset++;
1471 used--;
1472 }1473 }1474 else1475 used = urg_offset;
1476 }1477 }1478
1479 /*1480 * Copy it - We _MUST_ update *seq first so that we1481 * don't ever double read when we have dual readers1482 */1483
1484 *seq += used;
1485
1486 /*1487 * This memcpy_tofs can sleep. If it sleeps and we1488 * do a second read it relies on the skb->users to avoid1489 * a crash when cleanup_rbuf() gets called.1490 */1491
1492 memcpy_toiovec(msg->msg_iov,((unsignedchar *)skb->h.th) +
1493 skb->h.th->doff*4 + offset, used);
1494 copied += used;
1495 len -= used;
1496
1497 /*1498 * We now will not sleep again until we are finished1499 * with skb. Sorry if you are doing the SMP port1500 * but you'll just have to fix it neatly ;)1501 */1502
1503 skb->users --;
1504
1505 if (after(sk->copied_seq,sk->urg_seq))
1506 sk->urg_data = 0;
1507 if (used + offset < skb->len)
1508 continue;
1509
1510 /*1511 * Process the FIN.1512 */1513
1514 if (skb->h.th->fin)
1515 gotofound_fin_ok;
1516 if (flags & MSG_PEEK)
1517 continue;
1518 skb->used = 1;
1519 if (!skb->users)
1520 tcp_eat_skb(sk, skb);
1521 continue;
1522
1523 found_fin_ok:
1524 ++*seq;
1525 if (flags & MSG_PEEK)
1526 break;
1527
1528 /*1529 * All is done1530 */1531
1532 skb->used = 1;
1533 sk->shutdown |= RCV_SHUTDOWN;
1534 break;
1535
1536 }1537
1538 if(copied>0 && msg->msg_name)
1539 {1540 structsockaddr_in *sin=(structsockaddr_in *)msg->msg_name;
1541 sin->sin_family=AF_INET;
1542 sin->sin_addr.s_addr=sk->daddr;
1543 sin->sin_port=sk->dummy_th.dest;
1544 }1545 if(addr_len)
1546 *addr_len=sizeof(structsockaddr_in);
1547
1548 remove_wait_queue(sk->sleep, &wait);
1549 current->state = TASK_RUNNING;
1550
1551 /* Clean up data we have read: This will do ACK frames */1552 cleanup_rbuf(sk);
1553 release_sock(sk);
1554 returncopied;
1555 }1556
1557
1558
1559 /*1560 * State processing on a close. This implements the state shift for1561 * sending our FIN frame. Note that we only send a FIN for some1562 * states. A shutdown() may have already sent the FIN, or we may be1563 * closed.1564 */1565
1566 staticinttcp_close_state(structsock *sk, intdead)
/* */1567 {1568 intns=TCP_CLOSE;
1569 intsend_fin=0;
1570 switch(sk->state)
1571 {1572 caseTCP_SYN_SENT: /* No SYN back, no FIN needed */1573 break;
1574 caseTCP_SYN_RECV:
1575 caseTCP_ESTABLISHED: /* Closedown begin */1576 ns=TCP_FIN_WAIT1;
1577 send_fin=1;
1578 break;
1579 caseTCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */1580 caseTCP_FIN_WAIT2:
1581 caseTCP_CLOSING:
1582 ns=sk->state;
1583 break;
1584 caseTCP_CLOSE:
1585 caseTCP_LISTEN:
1586 break;
1587 caseTCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and1588 wait only for the ACK */1589 ns=TCP_LAST_ACK;
1590 send_fin=1;
1591 }1592
1593 tcp_set_state(sk,ns);
1594
1595 /*1596 * This is a (useful) BSD violating of the RFC. There is a1597 * problem with TCP as specified in that the other end could1598 * keep a socket open forever with no application left this end.1599 * We use a 3 minute timeout (about the same as BSD) then kill1600 * our end. If they send after that then tough - BUT: long enough1601 * that we won't make the old 4*rto = almost no time - whoops1602 * reset mistake.1603 */1604 if(dead && ns==TCP_FIN_WAIT2)
1605 {1606 inttimer_active=del_timer(&sk->timer);
1607 if(timer_active)
1608 add_timer(&sk->timer);
1609 else1610 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
1611 }1612
1613 returnsend_fin;
1614 }1615
1616 /*1617 * Shutdown the sending side of a connection. Much like close except1618 * that we don't receive shut down or set sk->dead.1619 */1620
1621 voidtcp_shutdown(structsock *sk, inthow)
/* */1622 {1623 /*1624 * We need to grab some memory, and put together a FIN,1625 * and then put it into the queue to be sent.1626 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.1627 */1628
1629 if (!(how & SEND_SHUTDOWN))
1630 return;
1631
1632 /*1633 * If we've already sent a FIN, or it's a closed state1634 */1635
1636 if (sk->state == TCP_FIN_WAIT1 ||
1637 sk->state == TCP_FIN_WAIT2 ||
1638 sk->state == TCP_CLOSING ||
1639 sk->state == TCP_LAST_ACK ||
1640 sk->state == TCP_TIME_WAIT ||
1641 sk->state == TCP_CLOSE ||
1642 sk->state == TCP_LISTEN1643 )
1644 {1645 return;
1646 }1647 lock_sock(sk);
1648
1649 /*1650 * flag that the sender has shutdown1651 */1652
1653 sk->shutdown |= SEND_SHUTDOWN;
1654
1655 /*1656 * Clear out any half completed packets.1657 */1658
1659 if (sk->partial)
1660 tcp_send_partial(sk);
1661
1662 /*1663 * FIN if needed1664 */1665
1666 if (tcp_close_state(sk,0))
1667 tcp_send_fin(sk);
1668
1669 release_sock(sk);
1670 }1671
1672
1673 /*1674 * Return 1 if we still have things to send in our buffers.1675 */1676
1677 staticinlineintclosing(structsock * sk)
/* */1678 {1679 switch (sk->state) {1680 caseTCP_FIN_WAIT1:
1681 caseTCP_CLOSING:
1682 caseTCP_LAST_ACK:
1683 return 1;
1684 }1685 return 0;
1686 }1687
1688
1689 staticvoidtcp_close(structsock *sk, unsignedlongtimeout)
/* */1690 {1691 structsk_buff *skb;
1692
1693 /*1694 * We need to grab some memory, and put together a FIN,1695 * and then put it into the queue to be sent.1696 */1697
1698 lock_sock(sk);
1699
1700 tcp_cache_zap();
1701 if(sk->state == TCP_LISTEN)
1702 {1703 /* Special case */1704 tcp_set_state(sk, TCP_CLOSE);
1705 tcp_close_pending(sk);
1706 release_sock(sk);
1707 sk->dead = 1;
1708 return;
1709 }1710
1711 sk->keepopen = 1;
1712 sk->shutdown = SHUTDOWN_MASK;
1713
1714 if (!sk->dead)
1715 sk->state_change(sk);
1716
1717 /*1718 * We need to flush the recv. buffs. We do this only on the1719 * descriptor close, not protocol-sourced closes, because the1720 * reader process may not have drained the data yet!1721 */1722
1723 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
1724 kfree_skb(skb, FREE_READ);
1725
1726 /*1727 * Get rid off any half-completed packets.1728 */1729
1730 if (sk->partial)
1731 tcp_send_partial(sk);
1732
1733 /*1734 * Timeout is not the same thing - however the code likes1735 * to send both the same way (sigh).1736 */1737
1738 if (tcp_close_state(sk,1)==1)
1739 {1740 tcp_send_fin(sk);
1741 }1742
1743 if (timeout) {1744 cli();
1745 release_sock(sk);
1746 current->timeout = timeout;
1747 while(closing(sk) && current->timeout)
1748 {1749 interruptible_sleep_on(sk->sleep);
1750 if (current->signal & ~current->blocked)
1751 {1752 break;
1753 }1754 }1755 current->timeout=0;
1756 lock_sock(sk);
1757 sti();
1758 }1759
1760 /*1761 * This will destroy it. The timers will take care of actually1762 * free'ing up the memory.1763 */1764 tcp_cache_zap(); /* Kill the cache again. */1765 release_sock(sk);
1766 sk->dead = 1;
1767 }1768
1769
1770 /*1771 * Wait for a incoming connection, avoid race1772 * conditions. This must be called with the socket1773 * locked.1774 */1775 staticstructsk_buff * wait_for_connect(structsock * sk)
/* */1776 {1777 structwait_queuewait = {current, NULL};
1778 structsk_buff * skb = NULL;
1779
1780 add_wait_queue(sk->sleep, &wait);
1781 for (;;) {1782 current->state = TASK_INTERRUPTIBLE;
1783 release_sock(sk);
1784 schedule();
1785 lock_sock(sk);
1786 skb = tcp_find_established(sk);
1787 if (skb)
1788 break;
1789 if (current->signal & ~current->blocked)
1790 break;
1791 }1792 remove_wait_queue(sk->sleep, &wait);
1793 returnskb;
1794 }1795
1796 /*1797 * This will accept the next outstanding connection.1798 *1799 * Be careful about race conditions here - this is subtle.1800 */1801
1802 staticstructsock *tcp_accept(structsock *sk, intflags)
/* */1803 {1804 interror;
1805 structsk_buff *skb;
1806 structsock *newsk = NULL;
1807
1808 /*1809 * We need to make sure that this socket is listening,1810 * and that it has something pending.1811 */1812
1813 error = EINVAL;
1814 if (sk->state != TCP_LISTEN)
1815 gotono_listen;
1816
1817 lock_sock(sk);
1818
1819 skb = tcp_find_established(sk);
1820 if (skb) {1821 got_new_connect:
1822 __skb_unlink(skb, &sk->receive_queue);
1823 newsk = skb->sk;
1824 kfree_skb(skb, FREE_READ);
1825 sk->ack_backlog--;
1826 error = 0;
1827 out:
1828 release_sock(sk);
1829 no_listen:
1830 sk->err = error;
1831 returnnewsk;
1832 }1833
1834 error = EAGAIN;
1835 if (flags & O_NONBLOCK)
1836 gotoout;
1837 skb = wait_for_connect(sk);
1838 if (skb)
1839 gotogot_new_connect;
1840 error = ERESTARTSYS;
1841 gotoout;
1842 }1843
1844
1845 /*1846 * This will initiate an outgoing connection.1847 */1848
1849 staticinttcp_connect(structsock *sk, structsockaddr_in *usin, intaddr_len)
/* */1850 {1851 structsk_buff *buff;
1852 structdevice *dev=NULL;
1853 unsignedchar *ptr;
1854 inttmp;
1855 intatype;
1856 structtcphdr *t1;
1857 structrtable *rt;
1858
1859 if (sk->state != TCP_CLOSE)
1860 return(-EISCONN);
1861
1862 /*1863 * Don't allow a double connect.1864 */1865
1866 if(sk->daddr)
1867 return -EINVAL;
1868
1869 if (addr_len < 8)
1870 return(-EINVAL);
1871
1872 if (usin->sin_family && usin->sin_family != AF_INET)
1873 return(-EAFNOSUPPORT);
1874
1875 /*1876 * connect() to INADDR_ANY means loopback (BSD'ism).1877 */1878
1879 if(usin->sin_addr.s_addr==INADDR_ANY)
1880 usin->sin_addr.s_addr=ip_my_addr();
1881
1882 /*1883 * Don't want a TCP connection going to a broadcast address1884 */1885
1886 if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
1887 return -ENETUNREACH;
1888
1889 lock_sock(sk);
1890 sk->daddr = usin->sin_addr.s_addr;
1891 sk->write_seq = tcp_init_seq();
1892 sk->window_seq = sk->write_seq;
1893 sk->rcv_ack_seq = sk->write_seq -1;
1894 sk->rcv_ack_cnt = 1;
1895 sk->err = 0;
1896 sk->dummy_th.dest = usin->sin_port;
1897 release_sock(sk);
1898
1899 buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
1900 if (buff == NULL)
1901 {1902 return(-ENOMEM);
1903 }1904 lock_sock(sk);
1905 buff->sk = sk;
1906 buff->free = 0;
1907 buff->localroute = sk->localroute;
1908
1909
1910 /*1911 * Put in the IP header and routing stuff.1912 */1913
1914 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
1915 IPPROTO_TCP, sk->opt, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1916 if (tmp < 0)
1917 {1918 sock_wfree(sk, buff);
1919 release_sock(sk);
1920 return(-ENETUNREACH);
1921 }1922 if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
1923 sk->saddr = rt->rt_src;
1924 sk->rcv_saddr = sk->saddr;
1925
1926 t1 = (structtcphdr *) skb_put(buff,sizeof(structtcphdr));
1927
1928 memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
1929 buff->seq = sk->write_seq++;
1930 t1->seq = htonl(buff->seq);
1931 sk->sent_seq = sk->write_seq;
1932 buff->end_seq = sk->write_seq;
1933 t1->ack = 0;
1934 t1->window = 2;
1935 t1->syn = 1;
1936 t1->doff = 6;
1937 /* use 512 or whatever user asked for */1938
1939 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
1940 sk->window_clamp=rt->rt_window;
1941 else1942 sk->window_clamp=0;
1943
1944 if (sk->user_mss)
1945 sk->mtu = sk->user_mss;
1946 elseif (rt)
1947 sk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
1948 else1949 sk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
1950
1951 /*1952 * but not bigger than device MTU1953 */1954
1955 if(sk->mtu <32)
1956 sk->mtu = 32; /* Sanity limit */1957
1958 sk->mtu = min(sk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
1959
1960 #ifdefCONFIG_SKIP1961
1962 /*1963 * SKIP devices set their MTU to 65535. This is so they can take packets1964 * unfragmented to security process then fragment. They could lie to the1965 * TCP layer about a suitable MTU, but it's easier to let skip sort it out1966 * simply because the final package we want unfragmented is going to be1967 *1968 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]1969 */1970
1971 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */1972 sk->mtu=skip_pick_mtu(sk->mtu,dev);
1973 #endif1974
1975 /*1976 * Put in the TCP options to say MTU.1977 */1978
1979 ptr = skb_put(buff,4);
1980 ptr[0] = 2;
1981 ptr[1] = 4;
1982 ptr[2] = (sk->mtu) >> 8;
1983 ptr[3] = (sk->mtu) & 0xff;
1984 buff->csum = csum_partial(ptr, 4, 0);
1985 tcp_send_check(t1, sk->saddr, sk->daddr,
1986 sizeof(structtcphdr) + 4, buff);
1987
1988 /*1989 * This must go first otherwise a really quick response will get reset.1990 */1991
1992 tcp_cache_zap();
1993 tcp_set_state(sk,TCP_SYN_SENT);
1994 if(rt&&rt->rt_flags&RTF_IRTT)
1995 sk->rto = rt->rt_irtt;
1996 else1997 sk->rto = TCP_TIMEOUT_INIT;
1998 sk->delack_timer.function = tcp_delack_timer;
1999 sk->delack_timer.data = (unsignedlong) sk;
2000 sk->retransmit_timer.function = tcp_retransmit_timer;
2001 sk->retransmit_timer.data = (unsignedlong)sk;
2002 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */2003 sk->retransmits = 0; /* Now works the right way instead of a hacked2004 initial setting */2005
2006 sk->prot->queue_xmit(sk, dev, buff, 0);
2007 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2008 tcp_statistics.TcpActiveOpens++;
2009 tcp_statistics.TcpOutSegs++;
2010
2011 release_sock(sk);
2012 return(0);
2013 }2014
2015 /*2016 * Socket option code for TCP.2017 */2018
2019 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */2020 {2021 intval,err;
2022
2023 if(level!=SOL_TCP)
2024 returnip_setsockopt(sk,level,optname,optval,optlen);
2025
2026 if (optval == NULL)
2027 return(-EINVAL);
2028
2029 err=verify_area(VERIFY_READ, optval, sizeof(int));
2030 if(err)
2031 returnerr;
2032
2033 val = get_user((int *)optval);
2034
2035 switch(optname)
2036 {2037 caseTCP_MAXSEG:
2038 /*2039 * values greater than interface MTU won't take effect. however at2040 * the point when this call is done we typically don't yet know2041 * which interface is going to be used2042 */2043 if(val<1||val>MAX_WINDOW)
2044 return -EINVAL;
2045 sk->user_mss=val;
2046 return 0;
2047 caseTCP_NODELAY:
2048 sk->nonagle=(val==0)?0:1;
2049 return 0;
2050 default:
2051 return(-ENOPROTOOPT);
2052 }2053 }2054
2055 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */2056 {2057 intval,err;
2058
2059 if(level!=SOL_TCP)
2060 returnip_getsockopt(sk,level,optname,optval,optlen);
2061
2062 switch(optname)
2063 {2064 caseTCP_MAXSEG:
2065 val=sk->user_mss;
2066 break;
2067 caseTCP_NODELAY:
2068 val=sk->nonagle;
2069 break;
2070 default:
2071 return(-ENOPROTOOPT);
2072 }2073 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
2074 if(err)
2075 returnerr;
2076 put_user(sizeof(int),(int *) optlen);
2077
2078 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
2079 if(err)
2080 returnerr;
2081 put_user(val,(int *)optval);
2082
2083 return(0);
2084 }2085
2086
2087 structprototcp_prot = {2088 tcp_close,
2089 ip_build_header,
2090 tcp_connect,
2091 tcp_accept,
2092 ip_queue_xmit,
2093 tcp_retransmit,
2094 tcp_write_wakeup,
2095 tcp_read_wakeup,
2096 tcp_rcv,
2097 tcp_select,
2098 tcp_ioctl,
2099 NULL,
2100 tcp_shutdown,
2101 tcp_setsockopt,
2102 tcp_getsockopt,
2103 tcp_sendmsg,
2104 tcp_recvmsg,
2105 NULL, /* No special bind() */2106 128,
2107 0,
2108 "TCP",
2109 0, 0,
2110 {NULL,}2111 };