1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while 26 * sk->inuse=1 and was trying to connect 27 * (tcp_err()). 28 * Alan Cox : All icmp error handling was broken 29 * pointers passed where wrong and the 30 * socket was looked up backwards. Nobody 31 * tested any icmp error code obviously. 32 * Alan Cox : tcp_err() now handled properly. It 33 * wakes people on errors. select 34 * behaves and the icmp error race 35 * has gone by moving it into sock.c 36 * Alan Cox : tcp_send_reset() fixed to work for 37 * everything not just packets for 38 * unknown sockets. 39 * Alan Cox : tcp option processing. 40 * Alan Cox : Reset tweaked (still not 100%) [Had 41 * syn rule wrong] 42 * Herp Rosmanith : More reset fixes 43 * Alan Cox : No longer acks invalid rst frames. 44 * Acking any kind of RST is right out. 45 * Alan Cox : Sets an ignore me flag on an rst 46 * receive otherwise odd bits of prattle 47 * escape still 48 * Alan Cox : Fixed another acking RST frame bug. 49 * Should stop LAN workplace lockups. 
50 * Alan Cox : Some tidyups using the new skb list 51 * facilities 52 * Alan Cox : sk->keepopen now seems to work 53 * Alan Cox : Pulls options out correctly on accepts 54 * Alan Cox : Fixed assorted sk->rqueue->next errors 55 * Alan Cox : PSH doesn't end a TCP read. Switched a 56 * bit to skb ops. 57 * Alan Cox : Tidied tcp_data to avoid a potential 58 * nasty. 59 * Alan Cox : Added some better commenting, as the 60 * tcp is hard to follow 61 * Alan Cox : Removed incorrect check for 20 * psh 62 * Michael O'Reilly : ack < copied bug fix. 63 * Johannes Stille : Misc tcp fixes (not all in yet). 64 * Alan Cox : FIN with no memory -> CRASH 65 * Alan Cox : Added socket option proto entries. 66 * Also added awareness of them to accept. 67 * Alan Cox : Added TCP options (SOL_TCP) 68 * Alan Cox : Switched wakeup calls to callbacks, 69 * so the kernel can layer network 70 * sockets. 71 * Alan Cox : Use ip_tos/ip_ttl settings. 72 * Alan Cox : Handle FIN (more) properly (we hope). 73 * Alan Cox : RST frames sent on unsynchronised 74 * state ack error. 75 * Alan Cox : Put in missing check for SYN bit. 76 * Alan Cox : Added tcp_select_window() aka NET2E 77 * window non shrink trick. 78 * Alan Cox : Added a couple of small NET2E timer 79 * fixes 80 * Charles Hedrick : TCP fixes 81 * Toomas Tamm : TCP window fixes 82 * Alan Cox : Small URG fix to rlogin ^C ack fight 83 * Charles Hedrick : Rewrote most of it to actually work 84 * Linus : Rewrote tcp_read() and URG handling 85 * completely 86 * Gerhard Koerting: Fixed some missing timer handling 87 * Matthew Dillon : Reworked TCP machine states as per RFC 88 * Gerhard Koerting: PC/TCP workarounds 89 * Adam Caldwell : Assorted timer/timing errors 90 * Matthew Dillon : Fixed another RST bug 91 * Alan Cox : Move to kernel side addressing changes. 92 * Alan Cox : Beginning work on TCP fastpathing 93 * (not yet usable) 94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
95 * Alan Cox : TCP fast path debugging 96 * Alan Cox : Window clamping 97 * Michael Riepe : Bug in tcp_check() 98 * Matt Dillon : More TCP improvements and RST bug fixes 99 * Matt Dillon : Yet more small nasties remove from the 100 * TCP code (Be very nice to this man if 101 * tcp finally works 100%) 8) 102 * Alan Cox : BSD accept semantics. 103 * Alan Cox : Reset on closedown bug. 104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 105 * Michael Pall : Handle select() after URG properly in 106 * all cases. 107 * Michael Pall : Undo the last fix in tcp_read_urg() 108 * (multi URG PUSH broke rlogin). 109 * Michael Pall : Fix the multi URG PUSH problem in 110 * tcp_readable(), select() after URG 111 * works now. 112 * Michael Pall : recv(...,MSG_OOB) never blocks in the 113 * BSD api. 114 * Alan Cox : Changed the semantics of sk->socket to 115 * fix a race and a signal problem with 116 * accept() and async I/O. 117 * Alan Cox : Relaxed the rules on tcp_sendto(). 118 * Yury Shevchuk : Really fixed accept() blocking problem. 119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 120 * clients/servers which listen in on 121 * fixed ports. 122 * Alan Cox : Cleaned the above up and shrank it to 123 * a sensible code size. 124 * Alan Cox : Self connect lockup fix. 125 * Alan Cox : No connect to multicast. 126 * Ross Biro : Close unaccepted children on master 127 * socket close. 128 * Alan Cox : Reset tracing code. 129 * Alan Cox : Spurious resets on shutdown. 130 * Alan Cox : Giant 15 minute/60 second timer error 131 * Alan Cox : Small whoops in selecting before an 132 * accept. 133 * Alan Cox : Kept the state trace facility since 134 * it's handy for debugging. 135 * Alan Cox : More reset handler fixes. 
136 * Alan Cox : Started rewriting the code based on 137 * the RFC's for other useful protocol 138 * references see: Comer, KA9Q NOS, and 139 * for a reference on the difference 140 * between specifications and how BSD 141 * works see the 4.4lite source. 142 * A.N.Kuznetsov : Don't time wait on completion of tidy 143 * close. 144 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 145 * Linus Torvalds : Fixed BSD port reuse to work first syn 146 * Alan Cox : Reimplemented timers as per the RFC 147 * and using multiple timers for sanity. 148 * Alan Cox : Small bug fixes, and a lot of new 149 * comments. 150 * Alan Cox : Fixed dual reader crash by locking 151 * the buffers (much like datagram.c) 152 * Alan Cox : Fixed stuck sockets in probe. A probe 153 * now gets fed up of retrying without 154 * (even a no space) answer. 155 * Alan Cox : Extracted closing code better 156 * Alan Cox : Fixed the closing state machine to 157 * resemble the RFC. 158 * Alan Cox : More 'per spec' fixes. 159 * Jorge Cwik : Even faster checksumming. 160 * Alan Cox : tcp_data() doesn't ack illegal PSH 161 * only frames. At least one pc tcp stack 162 * generates them. 163 * Alan Cox : Cache last socket. 164 * Alan Cox : Per route irtt. 165 * Matt Day : Select() match BSD precisely on error 166 * Alan Cox : New buffers 167 * Marc Tamsky : Various sk->prot->retransmits and 168 * sk->retransmits misupdating fixed. 169 * Fixed tcp_write_timeout: stuck close, 170 * and TCP syn retries gets used now. 171 * Mark Yarvis : In tcp_read_wakeup(), don't send an 172 * ack if stat is TCP_CLOSED. 173 * Alan Cox : Look up device on a retransmit - routes may 174 * change. Doesn't yet cope with MSS shrink right 175 * but its a start! 176 * Marc Tamsky : Closing in closing fixes. 177 * Mike Shaver : RFC1122 verifications. 178 * Alan Cox : rcv_saddr errors. 179 * Alan Cox : Block double connect(). 180 * Alan Cox : Small hooks for enSKIP. 181 * Alexey Kuznetsov: Path MTU discovery. 
 *		Alan Cox	:	Support soft errors.
 *		Alan Cox	:	Fix MTU discovery pathological case
 *					when the remote claims no mtu!
 *		Marc Tamsky	:	TCP_CLOSE fix.
 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
 *					window but wrong (fixes NT lpd problems)
 *		Pedro Roque	:	Better TCP window handling, delayed ack.
 *		Joerg Reuter	:	No modification of locked buffers in
 *					tcp_do_retransmit()
 *		Eric Schenk	:	Changed receiver side silly window
 *					avoidance algorithm to BSD style
 *					algorithm. This doubles throughput
 *					against machines running Solaris,
 *					and seems to result in general
 *					improvement.
 *
 * To Fix:
 *		Fast path the code. Two things here - fix the window calculation
 *		so it doesn't iterate over the queue, also spot packets with no funny
 *		options arriving in order and process directly.
 *
 *		Rewrite output state machine to use a single queue.
 *		Speed up input assembly algorithm.
 *		RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 *		could do with it working on IPv4
 *		User settable/learned rtt/max window/mtu
 *
 *		Change the fundamental structure to a single send queue maintained
 *		by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 *		active routes too]). Cut the queue off in tcp_retransmit/
 *		tcp_transmit.
 *		Change the receive queue to assemble as it goes. This lets us
 *		dispose of most of tcp_sequence, half of tcp_ack and chunks of
 *		tcp_data/tcp_read as well as the window shrink crud.
 *		Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 *		tcp_queue_skb seem obvious routines to extract.
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 * Description of States:
 *
 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 *
 *	TCP_SYN_RECV		received a connection request, sent ack,
 *				waiting for final ack in three-way handshake.
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *				to shutdown
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING.  Required because the other end
 *				may not have gotten our last ACK causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown.  There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */

265 /* 266 * RFC1122 status: 267 * NOTE: I'm not going to be doing comments in the code for this one except 268 * for violations and the like. tcp.c is just too big... If I say something 269 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out 270 * with Alan. -- MS 950903 271 * 272 * Use of PSH (4.2.2.2) 273 * MAY aggregate data sent without the PSH flag. (does) 274 * MAY queue data received without the PSH flag. (does) 275 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't) 276 * MAY implement PSH on send calls. (doesn't, thus:) 277 * MUST NOT buffer data indefinitely (doesn't [1 second]) 278 * MUST set PSH on last segment (does) 279 * MAY pass received PSH to application layer (doesn't) 280 * SHOULD send maximum-sized segment whenever possible. (almost always does) 281 * 282 * Window Size (4.2.2.3, 4.2.2.16) 283 * MUST treat window size as an unsigned number (does) 284 * SHOULD treat window size as a 32-bit number (does not) 285 * MUST NOT shrink window once it is offered (does not normally) 286 * 287 * Urgent Pointer (4.2.2.4) 288 * **MUST point urgent pointer to last byte of urgent data (not right 289 * after). (doesn't, to be like BSD) 290 * MUST inform application layer asynchronously of incoming urgent 291 * data. (does) 292 * MUST provide application with means of determining the amount of 293 * urgent data pending. (does) 294 * **MUST support urgent data sequence of arbitrary length. (doesn't, but 295 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) 296 * [Follows BSD 1 byte of urgent data] 297 * 298 * TCP Options (4.2.2.5) 299 * MUST be able to receive TCP options in any segment. (does) 300 * MUST ignore unsupported options (does) 301 * 302 * Maximum Segment Size Option (4.2.2.6) 303 * MUST implement both sending and receiving MSS. (does) 304 * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send 305 * it always). 
(does, even when MSS == 536, which is legal) 306 * MUST assume MSS == 536 if no MSS received at connection setup (does) 307 * MUST calculate "effective send MSS" correctly: 308 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) 309 * (does - but allows operator override) 310 * 311 * TCP Checksum (4.2.2.7) 312 * MUST generate and check TCP checksum. (does) 313 * 314 * Initial Sequence Number Selection (4.2.2.8) 315 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's 316 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is 317 * necessary for 10Mbps networks - and harder than BSD to spoof!) 318 * 319 * Simultaneous Open Attempts (4.2.2.10) 320 * MUST support simultaneous open attempts (does) 321 * 322 * Recovery from Old Duplicate SYN (4.2.2.11) 323 * MUST keep track of active vs. passive open (does) 324 * 325 * RST segment (4.2.2.12) 326 * SHOULD allow an RST segment to contain data (does, but doesn't do 327 * anything with it, which is standard) 328 * 329 * Closing a Connection (4.2.2.13) 330 * MUST inform application of whether connection was closed by RST or 331 * normal close. (does) 332 * MAY allow "half-duplex" close (treat connection as closed for the 333 * local app, even before handshake is done). (does) 334 * MUST linger in TIME_WAIT for 2 * MSL (does) 335 * 336 * Retransmission Timeout (4.2.2.15) 337 * MUST implement Jacobson's slow start and congestion avoidance 338 * stuff. (does) 339 * 340 * Probing Zero Windows (4.2.2.17) 341 * MUST support probing of zero windows. (does) 342 * MAY keep offered window closed indefinitely. (does) 343 * MUST allow remote window to stay closed indefinitely. (does) 344 * 345 * Passive Open Calls (4.2.2.18) 346 * MUST NOT let new passive open affect other connections. (doesn't) 347 * MUST support passive opens (LISTENs) concurrently. (does) 348 * 349 * Time to Live (4.2.2.19) 350 * MUST make TCP TTL configurable. 
(does - IP_TTL option) 351 * 352 * Event Processing (4.2.2.20) 353 * SHOULD queue out-of-order segments. (does) 354 * MUST aggregate ACK segments whenever possible. (does but badly) 355 * 356 * Retransmission Timeout Calculation (4.2.3.1) 357 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO 358 * calculation. (does, or at least explains them in the comments 8*b) 359 * SHOULD initialize RTO to 0 and RTT to 3. (does) 360 * 361 * When to Send an ACK Segment (4.2.3.2) 362 * SHOULD implement delayed ACK. (does) 363 * MUST keep ACK delay < 0.5 sec. (does) 364 * 365 * When to Send a Window Update (4.2.3.3) 366 * MUST implement receiver-side SWS. (does) 367 * 368 * When to Send Data (4.2.3.4) 369 * MUST implement sender-side SWS. (does) 370 * SHOULD implement Nagle algorithm. (does) 371 * 372 * TCP Connection Failures (4.2.3.5) 373 * MUST handle excessive retransmissions "properly" (see the RFC). (does) 374 * SHOULD inform application layer of soft errors. (does) 375 * 376 * TCP Keep-Alives (4.2.3.6) 377 * MAY provide keep-alives. (does) 378 * MUST make keep-alives configurable on a per-connection basis. (does) 379 * MUST default to no keep-alives. (does) 380 * **MUST make keep-alive interval configurable. (doesn't) 381 * **MUST make default keep-alive interval > 2 hours. (doesn't) 382 * MUST NOT interpret failure to ACK keep-alive packet as dead 383 * connection. (doesn't) 384 * SHOULD send keep-alive with no data. (does) 385 * 386 * TCP Multihoming (4.2.3.7) 387 * MUST get source address from IP layer before sending first 388 * SYN. (does) 389 * MUST use same local address for all segments of a connection. (does) 390 * 391 * IP Options (4.2.3.8) 392 * MUST ignore unsupported IP options. (does) 393 * MAY support Time Stamp and Record Route. (does) 394 * MUST allow application to specify a source route. (does) 395 * MUST allow received Source Route option to set route for all future 396 * segments on this connection. 
(does not (security issues)) 397 * 398 * ICMP messages (4.2.3.9) 399 * MUST act on ICMP errors. (does) 400 * MUST slow transmission upon receipt of a Source Quench. (does) 401 * MUST NOT abort connection upon receipt of soft Destination 402 * Unreachables (0, 1, 5), Time Exceededs and Parameter 403 * Problems. (doesn't) 404 * SHOULD report soft Destination Unreachables etc. to the 405 * application. (does) 406 * SHOULD abort connection upon receipt of hard Destination Unreachable 407 * messages (2, 3, 4). (does) 408 * 409 * Remote Address Validation (4.2.3.10) 410 * MUST reject as an error OPEN for invalid remote IP address. (does) 411 * MUST ignore SYN with invalid source address. (does) 412 * MUST silently discard incoming SYN for broadcast/multicast 413 * address. (does) 414 * 415 * Asynchronous Reports (4.2.4.1) 416 * MUST provide mechanism for reporting soft errors to application 417 * layer. (does) 418 * 419 * Type of Service (4.2.4.2) 420 * MUST allow application layer to set Type of Service. (does IP_TOS) 421 * 422 * (Whew. -- MS 950903) 423 **/ 424
425 #include <linux/config.h>
426 #include <linux/types.h>
427 #include <linux/fcntl.h>
428
429 #include <net/icmp.h>
430 #include <net/tcp.h>
431
432 #include <asm/segment.h>
433
434 unsignedlongseq_offset;
435 structtcp_mibtcp_statistics;
436
437 staticvoidtcp_close(structsock *sk, unsignedlongtimeout);
438
439 /* 440 * The less said about this the better, but it works and will do for 1.2 (and 1.4 ;)) 441 */ 442
443 structwait_queue *master_select_wakeup;
444
445 /* 446 * Find someone to 'accept'. Must be called with 447 * the socket locked or with interrupts disabled 448 */ 449
450 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 451 { 452 structsk_buff *p=skb_peek(&s->receive_queue);
453 if(p==NULL)
454 returnNULL;
455 do 456 { 457 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
458 returnp;
459 p=p->next;
460 } 461 while(p!=(structsk_buff *)&s->receive_queue);
462 returnNULL;
463 } 464
465 /* 466 * Remove a completed connection and return it. This is used by 467 * tcp_accept() to get connections from the queue. 468 */ 469
470 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 471 { 472 structsk_buff *skb;
473 unsignedlongflags;
474 save_flags(flags);
475 cli();
476 skb=tcp_find_established(s);
477 if(skb!=NULL)
478 skb_unlink(skb); /* Take it off the queue */ 479 restore_flags(flags);
480 returnskb;
481 } 482
483 /* 484 * This routine closes sockets which have been at least partially 485 * opened, but not yet accepted. Currently it is only called by 486 * tcp_close, and timeout mirrors the value there. 487 */ 488
489 staticvoidtcp_close_pending (structsock *sk)
/* */ 490 { 491 structsk_buff *skb;
492
493 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
494 { 495 tcp_close(skb->sk, 0);
496 kfree_skb(skb, FREE_READ);
497 } 498 return;
499 } 500
501 /* 502 * Enter the time wait state. 503 */ 504
505 voidtcp_time_wait(structsock *sk)
/* */ 506 { 507 tcp_set_state(sk,TCP_TIME_WAIT);
508 sk->shutdown = SHUTDOWN_MASK;
509 if (!sk->dead)
510 sk->state_change(sk);
511 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
512 } 513
514
515 /* 516 * This routine is called by the ICMP module when it gets some 517 * sort of error condition. If err < 0 then the socket should 518 * be closed and the error returned to the user. If err > 0 519 * it's just the icmp type << 8 | icmp code. After adjustment 520 * header points to the first 8 bytes of the tcp header. We need 521 * to find the appropriate port. 522 */ 523
524 voidtcp_err(inttype, intcode, unsignedchar *header, __u32daddr,
/* */ 525 __u32saddr, structinet_protocol *protocol)
526 { 527 structtcphdr *th = (structtcphdr *)header;
528 structsock *sk;
529
530 /* 531 * This one is _WRONG_. FIXME urgently. 532 */ 533 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 534 structiphdr *iph=(structiphdr *)(header-sizeof(structiphdr));
535 #endif 536 th =(structtcphdr *)header;
537 sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
538
539 if (sk == NULL)
540 return;
541
542 if (type == ICMP_SOURCE_QUENCH)
543 { 544 /* 545 * FIXME: 546 * For now we will just trigger a linear backoff. 547 * The slow start code should cause a real backoff here. 548 */ 549 if (sk->cong_window > 4)
550 sk->cong_window--;
551 return;
552 } 553
554 if (type == ICMP_PARAMETERPROB)
555 { 556 sk->err=EPROTO;
557 sk->error_report(sk);
558 } 559
560 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 561 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
562 { 563 structrtable * rt;
564 /* 565 * Ugly trick to pass MTU to protocol layer. 566 * Really we should add argument "info" to error handler. 567 */ 568 unsignedshortnew_mtu = ntohs(iph->id);
569
570 if ((rt = sk->ip_route_cache) != NULL)
571 if (rt->rt_mtu > new_mtu)
572 rt->rt_mtu = new_mtu;
573
574 if (sk->mtu > new_mtu - sizeof(structiphdr) - sizeof(structtcphdr)
575 && new_mtu > sizeof(structiphdr)+sizeof(structtcphdr))
576 sk->mtu = new_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
577
578 return;
579 } 580 #endif 581
582 /* 583 * If we've already connected we will keep trying 584 * until we time out, or the user gives up. 585 */ 586
587 if (code < 13)
588 { 589 if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
590 { 591 sk->err = icmp_err_convert[code].errno;
592 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
593 { 594 tcp_statistics.TcpAttemptFails++;
595 tcp_set_state(sk,TCP_CLOSE);
596 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ 597 } 598 } 599 else/* Only an error on timeout */ 600 sk->err_soft = icmp_err_convert[code].errno;
601 } 602 } 603
604
605 /* 606 * Walk down the receive queue counting readable data until we hit the end or we find a gap 607 * in the received data queue (ie a frame missing that needs sending to us). Not 608 * sorting using two queues as data arrives makes life so much harder. 609 */ 610
611 staticinttcp_readable(structsock *sk)
/* */ 612 { 613 unsignedlongcounted;
614 unsignedlongamount;
615 structsk_buff *skb;
616 intsum;
617 unsignedlongflags;
618
619 if(sk && sk->debug)
620 printk("tcp_readable: %p - ",sk);
621
622 save_flags(flags);
623 cli();
624 if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
625 { 626 restore_flags(flags);
627 if(sk && sk->debug)
628 printk("empty\n");
629 return(0);
630 } 631
632 counted = sk->copied_seq; /* Where we are at the moment */ 633 amount = 0;
634
635 /* 636 * Do until a push or until we are out of data. 637 */ 638
639 do 640 { 641 if (before(counted, skb->seq)) /* Found a hole so stops here */ 642 break;
643 sum = skb->len - (counted - skb->seq); /* Length - header but start from where we are up to (avoid overlaps) */ 644 if (skb->h.th->syn)
645 sum++;
646 if (sum > 0)
647 {/* Add it up, move on */ 648 amount += sum;
649 if (skb->h.th->syn)
650 amount--;
651 counted += sum;
652 } 653 /* 654 * Don't count urg data ... but do it in the right place! 655 * Consider: "old_data (ptr is here) URG PUSH data" 656 * The old code would stop at the first push because 657 * it counted the urg (amount==1) and then does amount-- 658 * *after* the loop. This means tcp_readable() always 659 * returned zero if any URG PUSH was in the queue, even 660 * though there was normal data available. If we subtract 661 * the urg data right here, we even get it to work for more 662 * than one URG PUSH skb without normal data. 663 * This means that select() finally works now with urg data 664 * in the queue. Note that rlogin was never affected 665 * because it doesn't use select(); it uses two processes 666 * and a blocking read(). And the queue scan in tcp_read() 667 * was correct. Mike <pall@rz.uni-karlsruhe.de> 668 */ 669 if (skb->h.th->urg)
670 amount--; /* don't count urg data */ 671 if (amount && skb->h.th->psh) break;
672 skb = skb->next;
673 } 674 while(skb != (structsk_buff *)&sk->receive_queue);
675
676 restore_flags(flags);
677 if(sk->debug)
678 printk("got %lu bytes.\n",amount);
679 return(amount);
680 } 681
682 /* 683 * LISTEN is a special case for select.. 684 */ 685 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */ 686 { 687 if (sel_type == SEL_IN) { 688 intretval;
689
690 lock_sock(sk);
691 retval = (tcp_find_established(sk) != NULL);
692 release_sock(sk);
693 if (!retval)
694 select_wait(&master_select_wakeup,wait);
695 returnretval;
696 } 697 return 0;
698 } 699
700
701 /* 702 * Wait for a TCP event. 703 * 704 * Note that we don't need to lock the socket, as the upper select layers 705 * take care of normal races (between the test and the event) and we don't 706 * go look at any of the socket buffers directly. 707 */ 708 staticinttcp_select(structsock *sk, intsel_type, select_table *wait)
/* */ 709 { 710 if (sk->state == TCP_LISTEN)
711 returntcp_listen_select(sk, sel_type, wait);
712
713 switch(sel_type) { 714 caseSEL_IN:
715 if (sk->err)
716 return 1;
717 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
718 break;
719
720 if (sk->shutdown & RCV_SHUTDOWN)
721 return 1;
722
723 if (sk->acked_seq == sk->copied_seq)
724 break;
725
726 if (sk->urg_seq != sk->copied_seq ||
727 sk->acked_seq != sk->copied_seq+1 ||
728 sk->urginline || !sk->urg_data)
729 return 1;
730 break;
731
732 caseSEL_OUT:
733 if (sk->err)
734 return 1;
735 if (sk->shutdown & SEND_SHUTDOWN)
736 return 0;
737 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
738 break;
739 /* 740 * This is now right thanks to a small fix 741 * by Matt Dillon. 742 */ 743
744 if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
745 break;
746 return 1;
747
748 caseSEL_EX:
749 if (sk->urg_data)
750 return 1;
751 break;
752 } 753 select_wait(sk->sleep, wait);
754 return 0;
755 } 756
757 inttcp_ioctl(structsock *sk, intcmd, unsignedlongarg)
/* */ 758 { 759 interr;
760 switch(cmd)
761 { 762
763 caseTIOCINQ:
764 #ifdef FIXME /* FIXME: */ 765 caseFIONREAD:
766 #endif 767 { 768 unsignedlongamount;
769
770 if (sk->state == TCP_LISTEN)
771 return(-EINVAL);
772
773 lock_sock(sk);
774 amount = tcp_readable(sk);
775 release_sock(sk);
776 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
777 if(err)
778 returnerr;
779 put_user(amount, (int *)arg);
780 return(0);
781 } 782 caseSIOCATMARK:
783 { 784 intansw = sk->urg_data && sk->urg_seq == sk->copied_seq;
785
786 err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
787 if (err)
788 returnerr;
789 put_user(answ,(int *) arg);
790 return(0);
791 } 792 caseTIOCOUTQ:
793 { 794 unsignedlongamount;
795
796 if (sk->state == TCP_LISTEN) return(-EINVAL);
797 amount = sock_wspace(sk);
798 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
799 if(err)
800 returnerr;
801 put_user(amount, (int *)arg);
802 return(0);
803 } 804 default:
805 return(-EINVAL);
806 } 807 } 808
809
810 /* 811 * This routine computes a TCP checksum. 812 * 813 * Modified January 1995 from a go-faster DOS routine by 814 * Jorge Cwik <jorge@laser.satlink.net> 815 */ 816 #undefDEBUG_TCP_CHECK 817 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */ 818 unsignedlongdaddr, intlen, structsk_buff *skb)
819 { 820 #ifdefDEBUG_TCP_CHECK 821 u16check;
822 #endif 823 th->check = 0;
824 th->check = tcp_check(th, len, saddr, daddr,
825 csum_partial((char *)th,sizeof(*th),skb->csum));
826
827 #ifdefDEBUG_TCP_CHECK 828 check = th->check;
829 th->check = 0;
830 th->check = tcp_check(th, len, saddr, daddr,
831 csum_partial((char *)th,len,0));
832 if (check != th->check) { 833 staticintcount = 0;
834 if (++count < 10) { 835 printk("Checksum %x (%x) from %p\n", th->check, check,
836 (&th)[-1]);
837 printk("TCP=<off:%d a:%d s:%d f:%d>\n", th->doff*4, th->ack, th->syn, th->fin);
838 } 839 } 840 #endif 841 } 842
843
844 /* 845 * This routine builds a generic TCP header. 846 */ 847
848 staticinlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */ 849 { 850 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
851 th->psh = (push == 0) ? 1 : 0;
852 th->seq = htonl(sk->write_seq);
853 th->ack_seq = htonl(sk->acked_seq);
854 th->window = htons(tcp_select_window(sk));
855
856 return(sizeof(*th));
857 } 858
859 /* 860 * Wait for a socket to get into the connected state 861 */ 862 staticvoidwait_for_tcp_connect(structsock * sk)
/* */ 863 { 864 release_sock(sk);
865 cli();
866 if (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT && sk->err == 0)
867 { 868 interruptible_sleep_on(sk->sleep);
869 } 870 sti();
871 lock_sock(sk);
872 } 873
874 /* 875 * Wait for more memory for a socket 876 */ 877 staticvoidwait_for_tcp_memory(structsock * sk)
/* */ 878 { 879 release_sock(sk);
880 cli();
881 if (sk->wmem_alloc*2 > sk->sndbuf &&
882 (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
883 && sk->err == 0)
884 { 885 sk->socket->flags &= ~SO_NOSPACE;
886 interruptible_sleep_on(sk->sleep);
887 } 888 sti();
889 lock_sock(sk);
890 } 891
892
893 /* 894 * This routine copies from a user buffer into a socket, 895 * and starts the transmit system. 896 */ 897
898 staticintdo_tcp_sendmsg(structsock *sk, structmsghdr *msg,
/* */ 899 intlen, intnonblock, intflags)
900 { 901 intcopied = 0;
902 intcopy;
903 inttmp;
904 intseglen;
905 intiovct=0;
906 structsk_buff *skb;
907 structsk_buff *send_tmp;
908 structproto *prot;
909 structdevice *dev = NULL;
910 unsignedchar *from;
911
912 /* 913 * Ok commence sending 914 */ 915
916 while(iovct<msg->msg_iovlen)
917 { 918 seglen=msg->msg_iov[iovct].iov_len;
919 from=msg->msg_iov[iovct++].iov_base;
920 prot = sk->prot;
921 while(seglen > 0)
922 { 923 /* 924 * Stop on errors 925 */ 926 if (sk->err)
927 { 928 if (copied)
929 returncopied;
930 returnsock_error(sk);
931 } 932
933 /* 934 * Make sure that we are established. 935 */ 936 if (sk->shutdown & SEND_SHUTDOWN)
937 { 938 if (copied)
939 returncopied;
940 return -EPIPE;
941 } 942
943 /* 944 * Wait for a connection to finish. 945 */ 946 while (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
947 { 948 if (copied)
949 returncopied;
950
951 if (sk->err)
952 returnsock_error(sk);
953
954 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
955 { 956 if (sk->keepopen)
957 send_sig(SIGPIPE, current, 0);
958 return -EPIPE;
959 } 960
961 if (nonblock)
962 return -EAGAIN;
963
964 if (current->signal & ~current->blocked)
965 return -ERESTARTSYS;
966
967 wait_for_tcp_connect(sk);
968 } 969
970 /* 971 * The following code can result in copy <= if sk->mss is ever 972 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window). 973 * sk->mtu is constant once SYN processing is finished. I.e. we 974 * had better not get here until we've seen his SYN and at least one 975 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.) 976 * But ESTABLISHED should guarantee that. sk->max_window is by definition 977 * non-decreasing. Note that any ioctl to set user_mss must be done 978 * before the exchange of SYN's. If the initial ack from the other 979 * end has a window of 0, max_window and thus mss will both be 0. 980 */ 981
982 /* 983 * Now we need to check if we have a half built packet. 984 */ 985 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 986 /* 987 * FIXME: I'm almost sure that this fragment is BUG, 988 * but it works... I do not know why 8) --ANK 989 * 990 * Really, we should rebuild all the queues... 991 * It's difficult. Temporary hack is to send all 992 * queued segments with allowed fragmentation. 993 */ 994 { 995 intnew_mss = min(sk->mtu, sk->max_window);
996 if (new_mss < sk->mss)
997 { 998 tcp_send_partial(sk);
999 sk->mss = new_mss;
1000 }1001 }1002 #endif1003
1004 if ((skb = tcp_dequeue_partial(sk)) != NULL)
1005 {1006 inttcp_size;
1007
1008 tcp_size = skb->tail - (unsignedchar *)(skb->h.th + 1);
1009
1010 /* Add more stuff to the end of skb->len */1011 if (!(flags & MSG_OOB))
1012 {1013 copy = min(sk->mss - tcp_size, seglen);
1014 if (copy <= 0)
1015 {1016 printk("TCP: **bug**: \"copy\" <= 0\n");
1017 return -EFAULT;
1018 }1019 tcp_size += copy;
1020 memcpy_fromfs(skb_put(skb,copy), from, copy);
1021 skb->csum = csum_partial(skb->tail - tcp_size, tcp_size, 0);
1022 from += copy;
1023 copied += copy;
1024 len -= copy;
1025 sk->write_seq += copy;
1026 seglen -= copy;
1027 }1028 if (tcp_size >= sk->mss || (flags & MSG_OOB) || !sk->packets_out)
1029 tcp_send_skb(sk, skb);
1030 else1031 tcp_enqueue_partial(skb, sk);
1032 continue;
1033 }1034
1035 /*1036 * We also need to worry about the window.1037 * If window < 1/2 the maximum window we've seen from this1038 * host, don't use it. This is sender side1039 * silly window prevention, as specified in RFC1122.1040 * (Note that this is different than earlier versions of1041 * SWS prevention, e.g. RFC813.). What we actually do is1042 * use the whole MSS. Since the results in the right1043 * edge of the packet being outside the window, it will1044 * be queued for later rather than sent.1045 */1046
1047 copy = sk->window_seq - sk->write_seq;
1048 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1049 copy = sk->mss;
1050 if (copy > seglen)
1051 copy = seglen;
1052 if (copy <= 0)
1053 {1054 printk("TCP: **bug**: copy=%d, sk->mss=%d\n", copy, sk->mss);
1055 return -EFAULT;
1056 }1057
1058 /*1059 * We should really check the window here also.1060 */1061
1062 send_tmp = NULL;
1063 if (copy < sk->mss && !(flags & MSG_OOB) && sk->packets_out)
1064 {1065 skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
1066 send_tmp = skb;
1067 }1068 else1069 {1070 skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
1071 }1072
1073 /*1074 * If we didn't get any memory, we need to sleep.1075 */1076
1077 if (skb == NULL)
1078 {1079 sk->socket->flags |= SO_NOSPACE;
1080 if (nonblock)
1081 {1082 if (copied)
1083 returncopied;
1084 return -EAGAIN;
1085 }1086
1087 if (current->signal & ~current->blocked)
1088 {1089 if (copied)
1090 returncopied;
1091 return -ERESTARTSYS;
1092 }1093
1094 wait_for_tcp_memory(sk);
1095 continue;
1096 }1097
1098 skb->sk = sk;
1099 skb->free = 0;
1100 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1101
1102 /*1103 * FIXME: we need to optimize this.1104 * Perhaps some hints here would be good.1105 */1106
1107 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1108 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1109 if (tmp < 0 )
1110 {1111 sock_wfree(sk, skb);
1112 if (copied)
1113 return(copied);
1114 return(tmp);
1115 }1116 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1117 skb->ip_hdr->frag_off |= htons(IP_DF);
1118 #endif1119 skb->dev = dev;
1120 skb->h.th =(structtcphdr *)skb_put(skb,sizeof(structtcphdr));
1121 tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
1122 if (tmp < 0)
1123 {1124 sock_wfree(sk, skb);
1125 if (copied)
1126 return(copied);
1127 return(tmp);
1128 }1129
1130 if (flags & MSG_OOB)
1131 {1132 skb->h.th->urg = 1;
1133 skb->h.th->urg_ptr = ntohs(copy);
1134 }1135
1136 skb->csum = csum_partial_copy_fromuser(from,
1137 skb_put(skb,copy), copy, 0);
1138
1139 from += copy;
1140 copied += copy;
1141 len -= copy;
1142 seglen -= copy;
1143 skb->free = 0;
1144 sk->write_seq += copy;
1145
1146 if (send_tmp != NULL)
1147 {1148 tcp_enqueue_partial(send_tmp, sk);
1149 continue;
1150 }1151 tcp_send_skb(sk, skb);
1152 }1153 }1154 sk->err = 0;
1155
1156 returncopied;
1157 }1158
1159
/*
 *	tcp_sendmsg - entry point for sendmsg/sendto/send on a TCP socket.
 *
 *	Validates flags and any supplied destination address against the
 *	connection state, then hands the real work to do_tcp_sendmsg()
 *	under the socket lock.  After sending, applies Nagle's rule to any
 *	half-built (partial) packet left queued.
 *
 *	Returns bytes accepted, or a negative errno.
 */
static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags)
{
	int retval = -EINVAL;

	/*
	 *	Do sanity checking for sendmsg/sendto/send
	 */

	/* Only MSG_OOB and MSG_DONTROUTE make sense for TCP. */
	if (flags & ~(MSG_OOB|MSG_DONTROUTE))
		goto out;
	if (msg->msg_name) {
		struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;

		if (msg->msg_namelen < sizeof(*addr))
			goto out;
		/* sin_family of 0 is tolerated (old BSD apps do this). */
		if (addr->sin_family && addr->sin_family != AF_INET)
			goto out;
		retval = -ENOTCONN;
		if(sk->state == TCP_CLOSE)
			goto out;
		/* An explicit address must match the connected peer exactly. */
		retval = -EISCONN;
		if (addr->sin_port != sk->dummy_th.dest)
			goto out;
		if (addr->sin_addr.s_addr != sk->daddr)
			goto out;
	}

	lock_sock(sk);
	retval = do_tcp_sendmsg(sk, msg, len, nonblock, flags);

	/*
	 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
	 *	interactive fast network servers. It's meant to be on and
	 *	it really improves the throughput though not the echo time
	 *	on my slow slip link - Alan
	 *
	 *	If not nagling we can send on the before case too..
	 */

	if (sk->partial) {
		if (!sk->packets_out ||
		    (sk->nonagle && before(sk->write_seq , sk->window_seq))) {
			tcp_send_partial(sk);
		}
	}

	release_sock(sk);

out:
	return retval;
}
1213
1214 /*1215 * Send an ack if one is backlogged at this point.1216 *1217 * This is called for delayed acks also.1218 */1219
1220 voidtcp_read_wakeup(structsock *sk)
/* */1221 {1222 if (!sk->ack_backlog)
1223 return;
1224
1225 /*1226 * If we're closed, don't send an ack, or we'll get a RST1227 * from the closed destination.1228 */1229 if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
1230 return;
1231
1232 tcp_send_ack(sk);
1233 }1234
1235
1236 /*1237 * Handle reading urgent data. BSD has very simple semantics for1238 * this, no blocking and very strange errors 8)1239 */1240
/*
 *	tcp_recv_urg - read the single byte of TCP urgent data.
 *
 *	Implements BSD's very simple OOB semantics: never blocks, returns
 *	strange-looking errors by design.  Returns 1 when the urgent byte
 *	was copied, 0 at EOF/shutdown, or a negative errno.
 */
static int tcp_recv_urg(struct sock * sk, int nonblock,
	 struct msghdr *msg, int len, int flags, int *addr_len)
{
	/*
	 *	No URG data to read: either the byte is delivered inline,
	 *	there is none pending, or it was already consumed.
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
		return sock_error(sk);

	if (sk->state == TCP_CLOSE || sk->done)
	{
		/* First read after close reports EOF, later ones ENOTCONN. */
		if (!sk->done)
		{
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	lock_sock(sk);
	if (sk->urg_data & URG_VALID)
	{
		/* Low byte of urg_data holds the urgent octet itself. */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;	/* Mark consumed */
		memcpy_toiovec(msg->msg_iov, &c, 1);
		if(msg->msg_name)
		{
			struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
			sin->sin_family=AF_INET;
			sin->sin_addr.s_addr=sk->daddr;
			sin->sin_port=sk->dummy_th.dest;
		}
		if(addr_len)
			*addr_len=sizeof(struct sockaddr_in);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 *	Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
	 *	the available implementations agree in this case:
	 *	this call should never block, independent of the
	 *	blocking state of the socket.
	 *	Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
1299 /*1300 * Release a skb if it is no longer needed. This routine1301 * must be called with interrupts disabled or with the1302 * socket locked so that the sk_buff queue operation is ok.1303 */1304
/*
 *	tcp_eat_skb - unlink a consumed buffer from the receive queue and
 *	free it.  Caller must hold the socket lock or have interrupts
 *	disabled so the queue operation is safe.
 */
static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
{
	skb->sk = sk;	/* Ensure the free is accounted to this socket */
	__skb_unlink(skb, &sk->receive_queue);
	kfree_skb(skb, FREE_READ);
}
1312 /*1313 * FIXME:1314 * This routine frees used buffers.1315 * It should consider sending an ACK to let the1316 * other end know we now have a bigger window.1317 */1318
/*
 *	cleanup_rbuf - free fully-consumed buffers at the head of the
 *	receive queue, then advertise any window increase this produced.
 */
static void cleanup_rbuf(struct sock *sk)
{
	/*
	 * NOTE! The socket must be locked, so that we don't get
	 * a messed-up receive queue.
	 */
	while (!skb_queue_empty(&sk->receive_queue)) {
		struct sk_buff *skb = sk->receive_queue.next;
		/* Stop at the first buffer still unread or still in use
		   by a sleeping reader (skb->users). */
		if (!skb->used || skb->users)
			break;
		tcp_eat_skb(sk, skb);
	}

	/*
	 * Tell the world if we raised the window.
	 */
	if (tcp_raise_window(sk))
		tcp_send_ack(sk);
}
1339
1340 /*1341 * This routine copies from a sock struct into the user buffer.1342 */1343
/*
 *	tcp_recvmsg - copy received data from the socket into the user's
 *	iovec.
 *
 *	Handles MSG_PEEK (reads through a local sequence copy so nothing
 *	is consumed), MSG_OOB (delegated to tcp_recv_urg), the urgent-mark
 *	stop point, blocking and nonblocking operation.
 *
 *	Returns bytes copied, or a negative errno.
 */
static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags, int *addr_len)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;		/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */

	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */

	if (flags & MSG_OOB)
		return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 *	MSG_PEEK advances only the local copy.
	 */

	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	lock_sock(sk);
	while (len > 0)
	{
		struct sk_buff * skb;
		u32 offset;

		/*
		 *	Are we at urgent data? Stop if we have read anything.
		 */

		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	We need to check signals first, to get correct SIGURG
		 *	handling.
		 */
		if (current->signal & ~current->blocked) {
			if (copied)
				break;
			copied = -ERESTARTSYS;
			break;
		}

		/*
		 *	Next get a buffer.  Set INTERRUPTIBLE before scanning
		 *	so a wakeup between scan and schedule() isn't lost.
		 */

		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* Sequence hole ahead of us - must wait for it. */
			if (before(*seq, skb->seq))
				break;
			offset = *seq - skb->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies a sequence number */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* Fully consumed; reapable */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* Nothing more to read right now; return what we have. */
		if (copied)
			break;

		if (sk->err)
		{
			copied = sock_error(sk);
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;	/* First read reports EOF */
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Release queue space / send window updates, then sleep. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		lock_sock(sk);
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */

		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */

		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?
		 */

		if (sk->urg_data)
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					/* Skip over the urgent byte unless
					   it is delivered inline. */
					if (!sk->urginline)
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;	/* Read up to the mark only */
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */

		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */

		memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */

		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;	/* Urgent byte is behind us */
		if (used + offset < skb->len)
			continue;		/* Buffer not yet exhausted */

		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		if (!skb->users)
			tcp_eat_skb(sk, skb);
		continue;

	found_fin_ok:
		++*seq;			/* FIN consumes a sequence number */
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */

		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}

	/* Report the peer's address if the caller asked for it. */
	if(copied>0 && msg->msg_name)
	{
		struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
		sin->sin_family=AF_INET;
		sin->sin_addr.s_addr=sk->daddr;
		sin->sin_port=sk->dummy_th.dest;
	}
	if(addr_len)
		*addr_len=sizeof(struct sockaddr_in);

	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
1587
1588
1589 /*1590 * State processing on a close. This implements the state shift for1591 * sending our FIN frame. Note that we only send a FIN for some1592 * states. A shutdown() may have already sent the FIN, or we may be1593 * closed.1594 */1595
/*
 *	tcp_close_state - perform the state shift for sending our FIN.
 *
 *	@sk:   socket being closed / shut down
 *	@dead: non-zero when called from a full close (no application left)
 *
 *	Moves the socket to the appropriate closing state and returns 1 if
 *	a FIN frame should be transmitted, 0 otherwise.
 */
static int tcp_close_state(struct sock *sk, int dead)
{
	int ns=TCP_CLOSE;
	int send_fin=0;
	switch(sk->state)
	{
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:	/* Closedown begin */
			ns=TCP_FIN_WAIT1;
			send_fin=1;
			break;
		case TCP_FIN_WAIT1:	/* Already closing, or FIN sent: no change */
		case TCP_FIN_WAIT2:
		case TCP_CLOSING:
			ns=sk->state;
			break;
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;
	}

	tcp_set_state(sk,ns);

	/*
	 *	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 */
	if(dead && ns==TCP_FIN_WAIT2)
	{
		/* Only start the FIN_WAIT2 reaper if no timer is already
		   pending; del_timer tells us whether one was running and
		   we immediately re-add it if so. */
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			add_timer(&sk->timer);
		else
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
	}

	return send_fin;
}
1646 /*1647 * Shutdown the sending side of a connection. Much like close except1648 * that we don't receive shut down or set sk->dead.1649 */1650
/*
 *	tcp_shutdown - shut down the sending side of a connection.
 *
 *	Much like close except that we don't receive shut down or set
 *	sk->dead.  Flushes any half-built packet and emits a FIN when the
 *	state machine calls for one.
 */
void tcp_shutdown(struct sock *sk, int how)
{
	/*
	 *	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 *	Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
	 */

	/* Only SEND_SHUTDOWN concerns us here. */
	if (!(how & SEND_SHUTDOWN))
		return;

	/*
	 *	If we've already sent a FIN, or it's a closed state
	 */

	if (sk->state == TCP_FIN_WAIT1 ||
	    sk->state == TCP_FIN_WAIT2 ||
	    sk->state == TCP_CLOSING ||
	    sk->state == TCP_LAST_ACK ||
	    sk->state == TCP_TIME_WAIT ||
	    sk->state == TCP_CLOSE ||
	    sk->state == TCP_LISTEN
	  )
	{
		return;
	}
	lock_sock(sk);

	/*
	 *	flag that the sender has shutdown
	 */

	sk->shutdown |= SEND_SHUTDOWN;

	/*
	 *	Clear out any half completed packets.
	 */

	if (sk->partial)
		tcp_send_partial(sk);

	/*
	 *	FIN if needed
	 */

	if (tcp_close_state(sk,0))
		tcp_send_fin(sk);

	release_sock(sk);
}
1702
1703 /*1704 * Return 1 if we still have things to send in our buffers.1705 */1706
1707 staticinlineintclosing(structsock * sk)
/* */1708 {1709 switch (sk->state) {1710 caseTCP_FIN_WAIT1:
1711 caseTCP_CLOSING:
1712 caseTCP_LAST_ACK:
1713 return 1;
1714 }1715 return 0;
1716 }1717
1718
/*
 *	tcp_close - close a TCP socket.
 *
 *	@timeout: if non-zero, linger (sleep) until our FIN is acked or
 *	the timeout expires.
 *
 *	Flushes unread receive data, sends a FIN when the state machine
 *	requires one, and marks the socket dead.
 */
static void tcp_close(struct sock *sk, unsigned long timeout)
{
	struct sk_buff *skb;

	/*
	 *	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 */

	lock_sock(sk);

	tcp_cache_zap();
	if(sk->state == TCP_LISTEN)
	{
		/* Special case: no FIN needed, just reap pending
		   half-open connections. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		sk->dead = 1;
		return;
	}

	/* NOTE(review): keepopen is forced on here, presumably so the
	   keepalive timer can reap a dead peer during close - confirm. */
	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;

	if (!sk->dead)
		sk->state_change(sk);

	/*
	 *	We need to flush the recv. buffs. We do this only on the
	 *	descriptor close, not protocol-sourced closes, because the
	 *	reader process may not have drained the data yet!
	 */

	while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
		kfree_skb(skb, FREE_READ);

	/*
	 *	Get rid off any half-completed packets.
	 */

	if (sk->partial)
		tcp_send_partial(sk);

	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */

	if (tcp_close_state(sk,1)==1)
	{
		tcp_send_fin(sk);
	}

	if (timeout) {
		/* Linger: sleep until the FIN is acked, a signal arrives,
		   or the timeout runs out. */
		cli();
		release_sock(sk);
		current->timeout = timeout;
		while(closing(sk) && current->timeout)
		{
			interruptible_sleep_on(sk->sleep);
			if (current->signal & ~current->blocked)
			{
				break;
			}
		}
		current->timeout=0;
		lock_sock(sk);
		sti();
	}

	/*
	 *	This will destroy it. The timers will take care of actually
	 *	free'ing up the memory.
	 */
	tcp_cache_zap();	/* Kill the cache again. */
	release_sock(sk);
	sk->dead = 1;
}
1799
1800 /*1801 * This will accept the next outstanding connection.1802 */1803
/*
 *	tcp_accept - accept the next outstanding connection on a
 *	listening socket.
 *
 *	Blocks (unless O_NONBLOCK) until an established connection is
 *	queued.  Returns the new socket, or NULL with sk->err set.
 */
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 *	We need to make sure that this socket is listening,
	 *	and that it has something pending.
	 */

	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race: interrupts stay off between testing the queue
	   and going to sleep, so an arriving connection can't be missed. */
	cli();
	lock_sock(sk);

	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked)
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		lock_sock(sk);
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk.
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
1858 /*1859 * This will initiate an outgoing connection.1860 */1861
/*
 *	tcp_connect - initiate an outgoing connection.
 *
 *	Validates the destination, picks an initial sequence number,
 *	builds and transmits the SYN (with an MSS option), and moves the
 *	socket to SYN_SENT with the retransmit timer armed.
 *
 *	Returns 0 on success or a negative errno.
 */
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
		return(-EISCONN);

	/*
	 *	Don't allow a double connect.
	 */

	if(sk->daddr)
		return -EINVAL;

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */

	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 *	Don't want a TCP connection going to a broadcast address
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	lock_sock(sk);
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->rcv_ack_cnt = 1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	/* Drop the lock across the GFP_KERNEL allocation below. */
	release_sock(sk);

	buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	lock_sock(sk);
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;


	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, sk->opt, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		sock_wfree(sk, buff);
		release_sock(sk);
		return(-ENETUNREACH);
	}
	/* Routing chose our source address for us. */
	if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
		sk->saddr = rt->rt_src;
	sk->rcv_saddr = sk->saddr;

	t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	buff->seq = sk->write_seq++;
	t1->seq = htonl(buff->seq);
	sk->sent_seq = sk->write_seq;
	buff->end_seq = sk->write_seq;
	t1->ack = 0;
	/* NOTE(review): raw value 2, not htons() - looks like a tiny
	   initial window on purpose; confirm before "fixing". */
	t1->window = 2;
	t1->syn = 1;
	t1->doff = 6;	/* 24 bytes: header plus the 4-byte MSS option */
	/* use 512 or whatever user asked for */

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if (rt)
		sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	else
		sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

	/*
	 *	but not bigger than device MTU
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

#ifdef CONFIG_SKIP

	/*
	 *	SKIP devices set their MTU to 65535. This is so they can take packets
	 *	unfragmented to security process then fragment. They could lie to the
	 *	TCP layer about a suitable MTU, but its easier to let skip sort it out
	 *	simply because the final package we want unfragmented is going to be
	 *
	 *	[IPHDR][IPSP][Security data][Modified TCP data][Security data]
	 */

	if(skip_pick_mtu!=NULL)		/* If SKIP is loaded.. */
		sk->mtu=skip_pick_mtu(sk->mtu,dev);
#endif

	/*
	 *	Put in the TCP options to say MTU.
	 */

	ptr = skb_put(buff,4);
	ptr[0] = 2;			/* Option kind: MSS */
	ptr[1] = 4;			/* Option length */
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	buff->csum = csum_partial(ptr, 4, 0);
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, buff);

	/*
	 *	This must go first otherwise a really quick response will get reset.
	 */

	tcp_cache_zap();
	tcp_set_state(sk,TCP_SYN_SENT);
	if(rt&&rt->rt_flags&RTF_IRTT)
		sk->rto = rt->rt_irtt;
	else
		sk->rto = TCP_TIMEOUT_INIT;
	sk->delack_timer.function = tcp_delack_timer;
	sk->delack_timer.data = (unsigned long) sk;
	sk->retransmit_timer.function = tcp_retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = 0;	/* Now works the right way instead of a hacked
				   initial setting */

	sk->prot->queue_xmit(sk, dev, buff, 0);
	/* NOTE(review): timer is reset a second time after queue_xmit;
	   appears redundant with the call above - confirm before removing. */
	tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
2028 /*2029 * Socket option code for TCP.2030 */2031
2032 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */2033 {2034 intval,err;
2035
2036 if(level!=SOL_TCP)
2037 returnip_setsockopt(sk,level,optname,optval,optlen);
2038
2039 if (optval == NULL)
2040 return(-EINVAL);
2041
2042 err=verify_area(VERIFY_READ, optval, sizeof(int));
2043 if(err)
2044 returnerr;
2045
2046 val = get_user((int *)optval);
2047
2048 switch(optname)
2049 {2050 caseTCP_MAXSEG:
2051 /*2052 * values greater than interface MTU won't take effect. however at2053 * the point when this call is done we typically don't yet know2054 * which interface is going to be used2055 */2056 if(val<1||val>MAX_WINDOW)
2057 return -EINVAL;
2058 sk->user_mss=val;
2059 return 0;
2060 caseTCP_NODELAY:
2061 sk->nonagle=(val==0)?0:1;
2062 return 0;
2063 default:
2064 return(-ENOPROTOOPT);
2065 }2066 }2067
/*
 *	tcp_getsockopt - read back a TCP-level socket option.
 *
 *	Supports TCP_MAXSEG and TCP_NODELAY; other levels are handed to
 *	the IP layer.  Returns 0 or a negative errno.
 */
int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
{
	int val,err;

	if(level!=SOL_TCP)
		return ip_getsockopt(sk,level,optname,optval,optlen);

	switch(optname)
	{
		case TCP_MAXSEG:
			val=sk->user_mss;
			break;
		case TCP_NODELAY:
			val=sk->nonagle;
			break;
		default:
			return(-ENOPROTOOPT);
	}
	/* Copy the length, then the value, out to user space; each
	   destination is verified first (pre-put_user-fault era API). */
	err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
	if(err)
		return err;
	put_user(sizeof(int),(int *) optlen);

	err=verify_area(VERIFY_WRITE, optval, sizeof(int));
	if(err)
		return err;
	put_user(val,(int *)optval);

	return(0);
}
2099
/*
 *	TCP protocol operations table.  Fields are positional; the role
 *	of each slot is inferred from the handler assigned to it - the
 *	authoritative order is the struct proto declaration in sock.h.
 */
struct proto tcp_prot = {
	tcp_close,		/* close */
	ip_build_header,	/* build_header */
	tcp_connect,		/* connect */
	tcp_accept,		/* accept */
	ip_queue_xmit,		/* queue_xmit */
	tcp_retransmit,		/* retransmit */
	tcp_write_wakeup,	/* write_wakeup */
	tcp_read_wakeup,	/* read_wakeup */
	tcp_rcv,		/* rcv */
	tcp_select,		/* select */
	tcp_ioctl,		/* ioctl */
	NULL,			/* init: none needed */
	tcp_shutdown,		/* shutdown */
	tcp_setsockopt,		/* setsockopt */
	tcp_getsockopt,		/* getsockopt */
	tcp_sendmsg,		/* sendmsg */
	tcp_recvmsg,		/* recvmsg */
	NULL,			/* No special bind() */
	128,			/* max_header */
	0,
	"TCP",			/* protocol name */
	0, 0,
	{NULL,}
};