1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while 26 * sk->inuse=1 and was trying to connect 27 * (tcp_err()). 28 * Alan Cox : All icmp error handling was broken 29 * pointers passed where wrong and the 30 * socket was looked up backwards. Nobody 31 * tested any icmp error code obviously. 32 * Alan Cox : tcp_err() now handled properly. It 33 * wakes people on errors. select 34 * behaves and the icmp error race 35 * has gone by moving it into sock.c 36 * Alan Cox : tcp_reset() fixed to work for 37 * everything not just packets for 38 * unknown sockets. 39 * Alan Cox : tcp option processing. 40 * Alan Cox : Reset tweaked (still not 100%) [Had 41 * syn rule wrong] 42 * Herp Rosmanith : More reset fixes 43 * Alan Cox : No longer acks invalid rst frames. 44 * Acking any kind of RST is right out. 45 * Alan Cox : Sets an ignore me flag on an rst 46 * receive otherwise odd bits of prattle 47 * escape still 48 * Alan Cox : Fixed another acking RST frame bug. 49 * Should stop LAN workplace lockups. 
50 * Alan Cox : Some tidyups using the new skb list 51 * facilities 52 * Alan Cox : sk->keepopen now seems to work 53 * Alan Cox : Pulls options out correctly on accepts 54 * Alan Cox : Fixed assorted sk->rqueue->next errors 55 * Alan Cox : PSH doesn't end a TCP read. Switched a 56 * bit to skb ops. 57 * Alan Cox : Tidied tcp_data to avoid a potential 58 * nasty. 59 * Alan Cox : Added some better commenting, as the 60 * tcp is hard to follow 61 * Alan Cox : Removed incorrect check for 20 * psh 62 * Michael O'Reilly : ack < copied bug fix. 63 * Johannes Stille : Misc tcp fixes (not all in yet). 64 * Alan Cox : FIN with no memory -> CRASH 65 * Alan Cox : Added socket option proto entries. 66 * Also added awareness of them to accept. 67 * Alan Cox : Added TCP options (SOL_TCP) 68 * Alan Cox : Switched wakeup calls to callbacks, 69 * so the kernel can layer network 70 * sockets. 71 * Alan Cox : Use ip_tos/ip_ttl settings. 72 * Alan Cox : Handle FIN (more) properly (we hope). 73 * Alan Cox : RST frames sent on unsynchronised 74 * state ack error. 75 * Alan Cox : Put in missing check for SYN bit. 76 * Alan Cox : Added tcp_select_window() aka NET2E 77 * window non shrink trick. 78 * Alan Cox : Added a couple of small NET2E timer 79 * fixes 80 * Charles Hedrick : TCP fixes 81 * Toomas Tamm : TCP window fixes 82 * Alan Cox : Small URG fix to rlogin ^C ack fight 83 * Charles Hedrick : Rewrote most of it to actually work 84 * Linus : Rewrote tcp_read() and URG handling 85 * completely 86 * Gerhard Koerting: Fixed some missing timer handling 87 * Matthew Dillon : Reworked TCP machine states as per RFC 88 * Gerhard Koerting: PC/TCP workarounds 89 * Adam Caldwell : Assorted timer/timing errors 90 * Matthew Dillon : Fixed another RST bug 91 * Alan Cox : Move to kernel side addressing changes. 92 * Alan Cox : Beginning work on TCP fastpathing 93 * (not yet usable) 94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
95 * Alan Cox : TCP fast path debugging 96 * Alan Cox : Window clamping 97 * Michael Riepe : Bug in tcp_check() 98 * Matt Dillon : More TCP improvements and RST bug fixes 99 * Matt Dillon : Yet more small nasties remove from the 100 * TCP code (Be very nice to this man if 101 * tcp finally works 100%) 8) 102 * Alan Cox : BSD accept semantics. 103 * Alan Cox : Reset on closedown bug. 104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 105 * Michael Pall : Handle select() after URG properly in 106 * all cases. 107 * Michael Pall : Undo the last fix in tcp_read_urg() 108 * (multi URG PUSH broke rlogin). 109 * Michael Pall : Fix the multi URG PUSH problem in 110 * tcp_readable(), select() after URG 111 * works now. 112 * Michael Pall : recv(...,MSG_OOB) never blocks in the 113 * BSD api. 114 * Alan Cox : Changed the semantics of sk->socket to 115 * fix a race and a signal problem with 116 * accept() and async I/O. 117 * Alan Cox : Relaxed the rules on tcp_sendto(). 118 * Yury Shevchuk : Really fixed accept() blocking problem. 119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 120 * clients/servers which listen in on 121 * fixed ports. 122 * Alan Cox : Cleaned the above up and shrank it to 123 * a sensible code size. 124 * Alan Cox : Self connect lockup fix. 125 * Alan Cox : No connect to multicast. 126 * Ross Biro : Close unaccepted children on master 127 * socket close. 128 * Alan Cox : Reset tracing code. 129 * Alan Cox : Spurious resets on shutdown. 130 * Alan Cox : Giant 15 minute/60 second timer error 131 * Alan Cox : Small whoops in selecting before an 132 * accept. 133 * Alan Cox : Kept the state trace facility since 134 * it's handy for debugging. 135 * Alan Cox : More reset handler fixes. 
136 * Alan Cox : Started rewriting the code based on 137 * the RFC's for other useful protocol 138 * references see: Comer, KA9Q NOS, and 139 * for a reference on the difference 140 * between specifications and how BSD 141 * works see the 4.4lite source. 142 * A.N.Kuznetsov : Don't time wait on completion of tidy 143 * close. 144 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 145 * Linus Torvalds : Fixed BSD port reuse to work first syn 146 * Alan Cox : Reimplemented timers as per the RFC 147 * and using multiple timers for sanity. 148 * Alan Cox : Small bug fixes, and a lot of new 149 * comments. 150 * Alan Cox : Fixed dual reader crash by locking 151 * the buffers (much like datagram.c) 152 * Alan Cox : Fixed stuck sockets in probe. A probe 153 * now gets fed up of retrying without 154 * (even a no space) answer. 155 * Alan Cox : Extracted closing code better 156 * Alan Cox : Fixed the closing state machine to 157 * resemble the RFC. 158 * Alan Cox : More 'per spec' fixes. 159 * Jorge Cwik : Even faster checksumming. 160 * Alan Cox : tcp_data() doesn't ack illegal PSH 161 * only frames. At least one pc tcp stack 162 * generates them. 163 * Alan Cox : Cache last socket. 164 * Alan Cox : Per route irtt. 165 * Matt Day : Select() match BSD precisely on error 166 * Alan Cox : New buffers 167 * Marc Tamsky : Various sk->prot->retransmits and 168 * sk->retransmits misupdating fixed. 169 * Fixed tcp_write_timeout: stuck close, 170 * and TCP syn retries gets used now. 171 * Mark Yarvis : In tcp_read_wakeup(), don't send an 172 * ack if stat is TCP_CLOSED. 173 * Alan Cox : Look up device on a retransmit - routes may 174 * change. Doesn't yet cope with MSS shrink right 175 * but its a start! 176 * Marc Tamsky : Closing in closing fixes. 177 * Mike Shaver : RFC1122 verifications. 178 * Alan Cox : rcv_saddr errors. 179 * Alan Cox : Block double connect(). 180 * Alan Cox : Small hooks for enSKIP. 181 * Alexey Kuznetsov: Path MTU discovery. 
182 * Alan Cox : Support soft errors. 183 * 184 * 185 * To Fix: 186 * Fast path the code. Two things here - fix the window calculation 187 * so it doesn't iterate over the queue, also spot packets with no funny 188 * options arriving in order and process directly. 189 * 190 * Rewrite output state machine to use a single queue and do low window 191 * situations as per the spec (RFC 1122) 192 * Speed up input assembly algorithm. 193 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 194 * could do with it working on IPv4 195 * User settable/learned rtt/max window/mtu 196 * Fix the window handling to use PR's new code. 197 * 198 * Change the fundamental structure to a single send queue maintained 199 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 200 * active routes too]). Cut the queue off in tcp_retransmit/ 201 * tcp_transmit. 202 * Change the receive queue to assemble as it goes. This lets us 203 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 204 * tcp_data/tcp_read as well as the window shrink crud. 205 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 206 * tcp_queue_skb seem obvious routines to extract. 207 * 208 * This program is free software; you can redistribute it and/or 209 * modify it under the terms of the GNU General Public License 210 * as published by the Free Software Foundation; either version 211 * 2 of the License, or(at your option) any later version. 212 * 213 * Description of States: 214 * 215 * TCP_SYN_SENT sent a connection request, waiting for ack 216 * 217 * TCP_SYN_RECV received a connection request, sent ack, 218 * waiting for final ack in three-way handshake. 
219 * 220 * TCP_ESTABLISHED connection established 221 * 222 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 223 * transmission of remaining buffered data 224 * 225 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 226 * to shutdown 227 * 228 * TCP_CLOSING both sides have shutdown but we still have 229 * data we have to finish sending 230 * 231 * TCP_TIME_WAIT timeout to catch resent junk before entering 232 * closed, can only be entered from FIN_WAIT2 233 * or CLOSING. Required because the other end 234 * may not have gotten our last ACK causing it 235 * to retransmit the data packet (which we ignore) 236 * 237 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 238 * us to finish writing our data and to shutdown 239 * (we have to close() to move on to LAST_ACK) 240 * 241 * TCP_LAST_ACK out side has shutdown after remote has 242 * shutdown. There may still be data in our 243 * buffer that we have to finish sending 244 * 245 * TCP_CLOSE socket is finished 246 */ 247
248 /* 249 * RFC1122 status: 250 * NOTE: I'm not going to be doing comments in the code for this one except 251 * for violations and the like. tcp.c is just too big... If I say something 252 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out 253 * with Alan. -- MS 950903 254 * 255 * Use of PSH (4.2.2.2) 256 * MAY aggregate data sent without the PSH flag. (does) 257 * MAY queue data recieved without the PSH flag. (does) 258 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't) 259 * MAY implement PSH on send calls. (doesn't, thus:) 260 * MUST NOT buffer data indefinitely (doesn't [1 second]) 261 * MUST set PSH on last segment (does) 262 * MAY pass received PSH to application layer (doesn't) 263 * SHOULD send maximum-sized segment whenever possible. (almost always does) 264 * 265 * Window Size (4.2.2.3, 4.2.2.16) 266 * MUST treat window size as an unsigned number (does) 267 * SHOULD treat window size as a 32-bit number (does not) 268 * MUST NOT shrink window once it is offered (does not normally) 269 * 270 * Urgent Pointer (4.2.2.4) 271 * **MUST point urgent pointer to last byte of urgent data (not right 272 * after). (doesn't, to be like BSD) 273 * MUST inform application layer asynchronously of incoming urgent 274 * data. (does) 275 * MUST provide application with means of determining the amount of 276 * urgent data pending. (does) 277 * **MUST support urgent data sequence of arbitrary length. (doesn't, but 278 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) 279 * [Follows BSD 1 byte of urgent data] 280 * 281 * TCP Options (4.2.2.5) 282 * MUST be able to recieve TCP options in any segment. (does) 283 * MUST ignore unsupported options (does) 284 * 285 * Maximum Segment Size Option (4.2.2.6) 286 * MUST implement both sending and receiving MSS. (does) 287 * SHOULD send an MSS with every SYN where recieve MSS != 536 (MAY send 288 * it always). 
(does, even when MSS == 536, which is legal) 289 * MUST assume MSS == 536 if no MSS received at connection setup (does) 290 * MUST calculate "effective send MSS" correctly: 291 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) 292 * (does - but allows operator override) 293 * 294 * TCP Checksum (4.2.2.7) 295 * MUST generate and check TCP checksum. (does) 296 * 297 * Initial Sequence Number Selection (4.2.2.8) 298 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's 299 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is 300 * necessary for 10Mbps networks - and harder than BSD to spoof!) 301 * 302 * Simultaneous Open Attempts (4.2.2.10) 303 * MUST support simultaneous open attempts (does) 304 * 305 * Recovery from Old Duplicate SYN (4.2.2.11) 306 * MUST keep track of active vs. passive open (does) 307 * 308 * RST segment (4.2.2.12) 309 * SHOULD allow an RST segment to contain data (does, but doesn't do 310 * anything with it, which is standard) 311 * 312 * Closing a Connection (4.2.2.13) 313 * MUST inform application of whether connectin was closed by RST or 314 * normal close. (does) 315 * MAY allow "half-duplex" close (treat connection as closed for the 316 * local app, even before handshake is done). (does) 317 * MUST linger in TIME_WAIT for 2 * MSL (does) 318 * 319 * Retransmission Timeout (4.2.2.15) 320 * MUST implement Jacobson's slow start and congestion avoidance 321 * stuff. (does) 322 * 323 * Probing Zero Windows (4.2.2.17) 324 * MUST support probing of zero windows. (does) 325 * MAY keep offered window closed indefinitely. (does) 326 * MUST allow remote window to stay closed indefinitely. (does) 327 * 328 * Passive Open Calls (4.2.2.18) 329 * MUST NOT let new passive open affect other connections. (doesn't) 330 * MUST support passive opens (LISTENs) concurrently. (does) 331 * 332 * Time to Live (4.2.2.19) 333 * MUST make TCP TTL configurable. 
(does - IP_TTL option) 334 * 335 * Event Processing (4.2.2.20) 336 * SHOULD queue out-of-order segments. (does) 337 * MUST aggregate ACK segments whenever possible. (does but badly) 338 * 339 * Retransmission Timeout Calculation (4.2.3.1) 340 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO 341 * calculation. (does, or at least explains them in the comments 8*b) 342 * SHOULD initialize RTO to 0 and RTT to 3. (does) 343 * 344 * When to Send an ACK Segment (4.2.3.2) 345 * SHOULD implement delayed ACK. (does not) 346 * MUST keep ACK delay < 0.5 sec. (N/A) 347 * 348 * When to Send a Window Update (4.2.3.3) 349 * MUST implement receiver-side SWS. (does) 350 * 351 * When to Send Data (4.2.3.4) 352 * MUST implement sender-side SWS. (does) 353 * SHOULD implement Nagle algorithm. (does) 354 * 355 * TCP Connection Failures (4.2.3.5) 356 * MUST handle excessive retransmissions "properly" (see the RFC). (does) 357 * SHOULD inform application layer of soft errors. (does) 358 * 359 * TCP Keep-Alives (4.2.3.6) 360 * MAY provide keep-alives. (does) 361 * MUST make keep-alives configurable on a per-connection basis. (does) 362 * MUST default to no keep-alives. (does) 363 * **MUST make keep-alive interval configurable. (doesn't) 364 * **MUST make default keep-alive interval > 2 hours. (doesn't) 365 * MUST NOT interpret failure to ACK keep-alive packet as dead 366 * connection. (doesn't) 367 * SHOULD send keep-alive with no data. (does) 368 * 369 * TCP Multihoming (4.2.3.7) 370 * MUST get source address from IP layer before sending first 371 * SYN. (does) 372 * MUST use same local address for all segments of a connection. (does) 373 * 374 * IP Options (4.2.3.8) 375 * MUST ignore unsupported IP options. (does) 376 * MAY support Time Stamp and Record Route. (does) 377 * MUST allow application to specify a source route. (does) 378 * MUST allow receieved Source Route option to set route for all future 379 * segments on this connection. 
(does not (security issues)) 380 * 381 * ICMP messages (4.2.3.9) 382 * MUST act on ICMP errors. (does) 383 * MUST slow transmission upon receipt of a Source Quench. (does) 384 * MUST NOT abort connection upon receipt of soft Destination 385 * Unreachables (0, 1, 5), Time Exceededs and Parameter 386 * Problems. (doesn't) 387 * SHOULD report soft Destination Unreachables etc. to the 388 * application. (does) 389 * SHOULD abort connection upon receipt of hard Destination Unreachable 390 * messages (2, 3, 4). (does) 391 * 392 * Remote Address Validation (4.2.3.10) 393 * MUST reject as an error OPEN for invalid remote IP address. (does) 394 * MUST ignore SYN with invalid source address. (does) 395 * MUST silently discard incoming SYN for broadcast/multicast 396 * address. (does) 397 * 398 * Asynchronous Reports (4.2.4.1) 399 * **MUST provide mechanism for reporting soft errors to application 400 * layer. (doesn't) 401 * 402 * Type of Service (4.2.4.2) 403 * MUST allow application layer to set Type of Service. (does IP_TOS) 404 * 405 * (Whew. -- MS 950903) 406 **/ 407
408 #include <linux/types.h>
409 #include <linux/sched.h>
410 #include <linux/mm.h>
411 #include <linux/time.h>
412 #include <linux/string.h>
413 #include <linux/config.h>
414 #include <linux/socket.h>
415 #include <linux/sockios.h>
416 #include <linux/termios.h>
417 #include <linux/in.h>
418 #include <linux/fcntl.h>
419 #include <linux/inet.h>
420 #include <linux/netdevice.h>
421 #include <net/snmp.h>
422 #include <net/ip.h>
423 #include <net/protocol.h>
424 #include <net/icmp.h>
425 #include <net/tcp.h>
426 #include <net/arp.h>
427 #include <linux/skbuff.h>
428 #include <net/sock.h>
429 #include <net/route.h>
430 #include <linux/errno.h>
431 #include <linux/timer.h>
432 #include <asm/system.h>
433 #include <asm/segment.h>
434 #include <linux/mm.h>
435 #include <net/checksum.h>
436
437 /* 438 * The MSL timer is the 'normal' timer. 439 */ 440
441 #definereset_msl_timer(x,y,z) reset_timer(x,y,z)
442
443 #define SEQ_TICK 3
444 unsignedlongseq_offset;
445 structtcp_mibtcp_statistics;
446
447 /* 448 * Cached last hit socket 449 */ 450
451 volatileunsignedlongth_cache_saddr,th_cache_daddr;
452 volatileunsignedshortth_cache_dport, th_cache_sport;
453 volatilestructsock *th_cache_sk;
454
455 voidtcp_cache_zap(void)
/* */ 456 { 457 unsignedlongflags;
458 save_flags(flags);
459 cli();
460 th_cache_saddr=0;
461 th_cache_daddr=0;
462 th_cache_dport=0;
463 th_cache_sport=0;
464 th_cache_sk=NULL;
465 restore_flags(flags);
466 } 467
static void tcp_close(struct sock *sk, int timeout);

/*
 *	The less said about this the better, but it works and will do for 1.2
 */

static struct wait_queue *master_select_wakeup;
/*
 *	Return the smaller of two unsigned quantities.
 */
static __inline__ int min(unsigned int a, unsigned int b)
{
	if (a < b)
		return a;
	return b;
}
/* Define STATE_TRACE to log TCP state transitions (debug aid). */
#undef STATE_TRACE

#ifdef STATE_TRACE
static char *statename[] = {
	"Unused", "Established", "Syn Sent", "Syn Recv",
	"Fin Wait 1", "Fin Wait 2", "Time Wait", "Close",
	"Close Wait", "Last ACK", "Listen", "Closing"
};
#endif
494 static__inline__voidtcp_set_state(structsock *sk, intstate)
/* */ 495 { 496 if(sk->state==TCP_ESTABLISHED)
497 tcp_statistics.TcpCurrEstab--;
498 #ifdefSTATE_TRACE 499 if(sk->debug)
500 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
501 #endif 502 /* This is a hack but it doesn't occur often and it's going to 503 be a real to fix nicely */ 504
505 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
506 { 507 wake_up_interruptible(&master_select_wakeup);
508 } 509 sk->state=state;
510 if(state==TCP_ESTABLISHED)
511 tcp_statistics.TcpCurrEstab++;
512 if(sk->state==TCP_CLOSE)
513 tcp_cache_zap();
514 } 515
516 /* 517 * This routine picks a TCP windows for a socket based on 518 * the following constraints 519 * 520 * 1. The window can never be shrunk once it is offered (RFC 793) 521 * 2. We limit memory per socket 522 * 523 * For now we use NET2E3's heuristic of offering half the memory 524 * we have handy. All is not as bad as this seems however because 525 * of two things. Firstly we will bin packets even within the window 526 * in order to get the data we are waiting for into the memory limit. 527 * Secondly we bin common duplicate forms at receive time 528 * Better heuristics welcome 529 */ 530
531 inttcp_select_window(structsock *sk)
/* */ 532 { 533 intnew_window = sock_rspace(sk);
534
535 if(sk->window_clamp)
536 new_window=min(sk->window_clamp,new_window);
537 /* 538 * Two things are going on here. First, we don't ever offer a 539 * window less than min(sk->mss, MAX_WINDOW/2). This is the 540 * receiver side of SWS as specified in RFC1122. 541 * Second, we always give them at least the window they 542 * had before, in order to avoid retracting window. This 543 * is technically allowed, but RFC1122 advises against it and 544 * in practice it causes trouble. 545 * 546 * Fixme: This doesn't correctly handle the case where 547 * new_window > sk->window but not by enough to allow for the 548 * shift in sequence space. 549 */ 550 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
551 return(sk->window);
552 return(new_window);
553 } 554
555 /* 556 * Find someone to 'accept'. Must be called with 557 * sk->inuse=1 or cli() 558 */ 559
560 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 561 { 562 structsk_buff *p=skb_peek(&s->receive_queue);
563 if(p==NULL)
564 returnNULL;
565 do 566 { 567 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
568 returnp;
569 p=p->next;
570 } 571 while(p!=(structsk_buff *)&s->receive_queue);
572 returnNULL;
573 } 574
575 /* 576 * Remove a completed connection and return it. This is used by 577 * tcp_accept() to get connections from the queue. 578 */ 579
580 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 581 { 582 structsk_buff *skb;
583 unsignedlongflags;
584 save_flags(flags);
585 cli();
586 skb=tcp_find_established(s);
587 if(skb!=NULL)
588 skb_unlink(skb); /* Take it off the queue */ 589 restore_flags(flags);
590 returnskb;
591 } 592
593 /* 594 * This routine closes sockets which have been at least partially 595 * opened, but not yet accepted. Currently it is only called by 596 * tcp_close, and timeout mirrors the value there. 597 */ 598
599 staticvoidtcp_close_pending (structsock *sk)
/* */ 600 { 601 structsk_buff *skb;
602
603 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
604 { 605 skb->sk->dead=1;
606 tcp_close(skb->sk, 0);
607 kfree_skb(skb, FREE_READ);
608 } 609 return;
610 } 611
612 /* 613 * Enter the time wait state. 614 */ 615
616 staticvoidtcp_time_wait(structsock *sk)
/* */ 617 { 618 tcp_set_state(sk,TCP_TIME_WAIT);
619 sk->shutdown = SHUTDOWN_MASK;
620 if (!sk->dead)
621 sk->state_change(sk);
622 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
623 } 624
625 /* 626 * A socket has timed out on its send queue and wants to do a 627 * little retransmitting. Currently this means TCP. 628 */ 629
630 voidtcp_do_retransmit(structsock *sk, intall)
/* */ 631 { 632 structsk_buff * skb;
633 structproto *prot;
634 structdevice *dev;
635 intct=0;
636 structrtable *rt;
637
638 prot = sk->prot;
639 skb = sk->send_head;
640
641 while (skb != NULL)
642 { 643 structtcphdr *th;
644 structiphdr *iph;
645 intsize;
646
647 dev = skb->dev;
648 IS_SKB(skb);
649 skb->when = jiffies;
650
651 /* 652 * Discard the surplus MAC header 653 */ 654
655 skb_pull(skb,((unsignedchar *)skb->ip_hdr)-skb->data);
656
657 /* 658 * In general it's OK just to use the old packet. However we 659 * need to use the current ack and window fields. Urg and 660 * urg_ptr could possibly stand to be updated as well, but we 661 * don't keep the necessary data. That shouldn't be a problem, 662 * if the other end is doing the right thing. Since we're 663 * changing the packet, we have to issue a new IP identifier. 664 */ 665
666 iph = (structiphdr *)skb->data;
667 th = (structtcphdr *)(((char *)iph) + (iph->ihl << 2));
668 size = ntohs(iph->tot_len) - (iph->ihl<<2);
669
670 /* 671 * Note: We ought to check for window limits here but 672 * currently this is done (less efficiently) elsewhere. 673 */ 674
675 /* 676 * Put a MAC header back on (may cause ARPing) 677 */ 678
679 { 680 /* ANK: UGLY, but the bug, that was here, should be fixed. 681 */ 682 structoptions * opt = (structoptions*)skb->proto_priv;
683 rt = ip_check_route(&sk->ip_route_cache, opt->srr?opt->faddr:iph->daddr, skb->localroute);
684 } 685
686 iph->id = htons(ip_id_count++);
687 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 688 if (rt && ntohs(iph->tot_len) > rt->rt_mtu)
689 iph->frag_off &= ~htons(IP_DF);
690 #endif 691 ip_send_check(iph);
692
693 if (rt==NULL) /* Deep poo */ 694 { 695 if(skb->sk)
696 { 697 skb->sk->err_soft=ENETUNREACH;
698 skb->sk->error_report(skb->sk);
699 } 700 } 701 else 702 { 703 dev=rt->rt_dev;
704 skb->raddr=rt->rt_gateway;
705 skb->dev=dev;
706 skb->arp=1;
707 if (rt->rt_hh)
708 { 709 memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len);
710 if (!rt->rt_hh->hh_uptodate)
711 { 712 skb->arp = 0;
713 #ifRT_CACHE_DEBUG >= 2
714 printk("tcp_do_retransmit: hh miss %08x via %08x\n", iph->daddr, rt->rt_gateway);
715 #endif 716 } 717 } 718 elseif (dev->hard_header)
719 { 720 if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
721 skb->arp=0;
722 } 723
724 /* 725 * This is not the right way to handle this. We have to 726 * issue an up to date window and ack report with this 727 * retransmit to keep the odd buggy tcp that relies on 728 * the fact BSD does this happy. 729 * We don't however need to recalculate the entire 730 * checksum, so someone wanting a small problem to play 731 * with might like to implement RFC1141/RFC1624 and speed 732 * this up by avoiding a full checksum. 733 */ 734
735 th->ack_seq = htonl(sk->acked_seq);
736 th->window = ntohs(tcp_select_window(sk));
737 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
738
739 /* 740 * If the interface is (still) up and running, kick it. 741 */ 742
743 if (dev->flags & IFF_UP)
744 { 745 /* 746 * If the packet is still being sent by the device/protocol 747 * below then don't retransmit. This is both needed, and good - 748 * especially with connected mode AX.25 where it stops resends 749 * occurring of an as yet unsent anyway frame! 750 * We still add up the counts as the round trip time wants 751 * adjusting. 752 */ 753 if (sk && !skb_device_locked(skb))
754 { 755 /* Remove it from any existing driver queue first! */ 756 skb_unlink(skb);
757 /* Now queue it */ 758 ip_statistics.IpOutRequests++;
759 dev_queue_xmit(skb, dev, sk->priority);
760 } 761 } 762 } 763
764 /* 765 * Count retransmissions 766 */ 767
768 ct++;
769 sk->prot->retransmits ++;
770 tcp_statistics.TcpRetransSegs++;
771
772
773 /* 774 * Only one retransmit requested. 775 */ 776
777 if (!all)
778 break;
779
780 /* 781 * This should cut it off before we send too many packets. 782 */ 783
784 if (ct >= sk->cong_window)
785 break;
786 skb = skb->link3;
787 } 788 } 789
790 /* 791 * Reset the retransmission timer 792 */ 793
794 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 795 { 796 del_timer(&sk->retransmit_timer);
797 sk->ip_xmit_timeout = why;
798 if((int)when < 0)
799 { 800 when=3;
801 printk("Error: Negative timer in xmit_timer\n");
802 } 803 sk->retransmit_timer.expires=jiffies+when;
804 add_timer(&sk->retransmit_timer);
805 } 806
807 /* 808 * This is the normal code called for timeouts. It does the retransmission 809 * and then does backoff. tcp_do_retransmit is separated out because 810 * tcp_ack needs to send stuff from the retransmit queue without 811 * initiating a backoff. 812 */ 813
814
815 voidtcp_retransmit_time(structsock *sk, intall)
/* */ 816 { 817 tcp_do_retransmit(sk, all);
818
819 /* 820 * Increase the timeout each time we retransmit. Note that 821 * we do not increase the rtt estimate. rto is initialized 822 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 823 * that doubling rto each time is the least we can get away with. 824 * In KA9Q, Karn uses this for the first few times, and then 825 * goes to quadratic. netBSD doubles, but only goes up to *64, 826 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 827 * defined in the protocol as the maximum possible RTT. I guess 828 * we'll have to use something other than TCP to talk to the 829 * University of Mars. 830 * 831 * PAWS allows us longer timeouts and large windows, so once 832 * implemented ftp to mars will work nicely. We will have to fix 833 * the 120 second clamps though! 834 */ 835
836 sk->retransmits++;
837 sk->prot->retransmits++;
838 sk->backoff++;
839 sk->rto = min(sk->rto << 1, 120*HZ);
840 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
841 } 842
843
844 /* 845 * A timer event has trigger a tcp retransmit timeout. The 846 * socket xmit queue is ready and set up to send. Because 847 * the ack receive code keeps the queue straight we do 848 * nothing clever here. 849 */ 850
851 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 852 { 853 if (all)
854 { 855 tcp_retransmit_time(sk, all);
856 return;
857 } 858
859 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 860 /* sk->ssthresh in theory can be zero. I guess that's OK */ 861 sk->cong_count = 0;
862
863 sk->cong_window = 1;
864
865 /* Do the actual retransmit. */ 866 tcp_retransmit_time(sk, all);
867 } 868
869 /* 870 * A write timeout has occurred. Process the after effects. 871 */ 872
873 staticinttcp_write_timeout(structsock *sk)
/* */ 874 { 875 /* 876 * Look for a 'soft' timeout. 877 */ 878 if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
879 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
880 { 881 /* 882 * Attempt to recover if arp has changed (unlikely!) or 883 * a route has shifted (not supported prior to 1.3). 884 */ 885 ip_rt_advice(&sk->ip_route_cache, 0);
886 } 887
888 /* 889 * Have we tried to SYN too many times (repent repent 8)) 890 */ 891
892 if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
893 { 894 if(sk->err_soft)
895 sk->err=sk->err_soft;
896 else 897 sk->err=ETIMEDOUT;
898 sk->error_report(sk);
899 del_timer(&sk->retransmit_timer);
900 tcp_statistics.TcpAttemptFails++; /* Is this right ??? - FIXME - */ 901 tcp_set_state(sk,TCP_CLOSE);
902 /* Don't FIN, we got nothing back */ 903 release_sock(sk);
904 return 0;
905 } 906 /* 907 * Has it gone just too far ? 908 */ 909 if (sk->retransmits > TCP_RETR2)
910 { 911 if(sk->err_soft)
912 sk->err = sk->err_soft;
913 else 914 sk->err = ETIMEDOUT;
915 sk->error_report(sk);
916 del_timer(&sk->retransmit_timer);
917 /* 918 * Time wait the socket 919 */ 920 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
921 { 922 tcp_set_state(sk,TCP_TIME_WAIT);
923 reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
924 } 925 else 926 { 927 /* 928 * Clean up time. 929 */ 930 tcp_set_state(sk, TCP_CLOSE);
931 release_sock(sk);
932 return 0;
933 } 934 } 935 return 1;
936 } 937
/*
 *	The TCP retransmit timer. This lacks a few small details.
 *
 *	1. 	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 */

/*
 * Timer callback (sk->retransmit_timer). 'data' is the struct sock
 * this timer belongs to; sk->ip_xmit_timeout records WHY the timer
 * was armed (TIME_PROBE0 / TIME_WRITE / TIME_KEEPOPEN) and selects
 * the action below.
 */
static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;

	/*
	 *	only process if socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	/* Lock the socket against the bottom half while we work on it. */
	sk->inuse = 1;
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			/* send_head is shared with interrupt context. */
			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				restore_flags(flags);
			}
			else
			{
				/*
				 *	Kicked by a delayed ack. Reset timer
				 *	correctly now
				 */
				if (jiffies < skb->when + sk->rto)
				{
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission
				 */
				sk->retransmits++;
				sk->prot->retransmits++;
				sk->prot->retransmit (sk, 0);
				/* Give up if we've retried too long (may close sk). */
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
1049 /*1050 * This routine is called by the ICMP module when it gets some1051 * sort of error condition. If err < 0 then the socket should1052 * be closed and the error returned to the user. If err > 01053 * it's just the icmp type << 8 | icmp code. After adjustment1054 * header points to the first 8 bytes of the tcp header. We need1055 * to find the appropriate port.1056 */1057
1058 voidtcp_err(inttype, intcode, unsignedchar *header, __u32daddr,
/* */1059 __u32saddr, structinet_protocol *protocol)
1060 {1061 structtcphdr *th = (structtcphdr *)header;
1062 structsock *sk;
1063
1064 /*1065 * This one is _WRONG_. FIXME urgently.1066 */1067 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1068 structiphdr *iph=(structiphdr *)(header-sizeof(structiphdr));
1069 #endif1070 th =(structtcphdr *)header;
1071 sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
1072
1073 if (sk == NULL)
1074 return;
1075
1076 if (type == ICMP_SOURCE_QUENCH)
1077 {1078 /*1079 * FIXME:1080 * For now we will just trigger a linear backoff.1081 * The slow start code should cause a real backoff here.1082 */1083 if (sk->cong_window > 4)
1084 sk->cong_window--;
1085 return;
1086 }1087
1088 if (type == ICMP_PARAMETERPROB)
1089 {1090 sk->err=EPROTO;
1091 sk->error_report(sk);
1092 }1093
1094 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1095 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
1096 {1097 structrtable * rt;
1098 /*1099 * Ugly trick to pass MTU to protocol layer.1100 * Really we should add argument "info" to error handler.1101 */1102 unsignedshortnew_mtu = ntohs(iph->id);
1103
1104 if ((rt = sk->ip_route_cache) != NULL)
1105 if (rt->rt_mtu > new_mtu)
1106 rt->rt_mtu = new_mtu;
1107
1108 if (sk->mtu > new_mtu - sizeof(structiphdr) - sizeof(structtcphdr))
1109 sk->mtu = new_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
1110
1111 return;
1112 }1113 #endif1114
1115 /*1116 * If we've already connected we will keep trying1117 * until we time out, or the user gives up.1118 */1119
1120 if (code < 13)
1121 {1122 if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1123 {1124 sk->err = icmp_err_convert[code].errno;
1125 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1126 {1127 tcp_statistics.TcpAttemptFails++;
1128 tcp_set_state(sk,TCP_CLOSE);
1129 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */1130 }1131 }1132 else/* Only an error on timeout */1133 sk->err_soft = icmp_err_convert[code].errno;
1134 }1135 }1136
1137
/*
 *	Walk down the receive queue counting readable data until we hit the end or we find a gap
 *	in the received data queue (ie a frame missing that needs sending to us). Not
 *	sorting using two queues as data arrives makes life so much harder.
 *
 *	Returns the number of in-sequence bytes available to read (0 if the
 *	queue is empty or starts with a hole).
 */

static int tcp_readable(struct sock *sk)
{
	unsigned long counted;
	unsigned long amount;
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	/* Walk the queue with interrupts off so the receive path cannot
	 * modify it underneath us. */
	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 *	Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->seq))		/* Found a hole so stops here */
			break;
		sum = skb->len - (counted - skb->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;		/* SYN occupies one sequence number but no data byte */
		if (sum > 0)
		{					/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;	/* ...but don't report the SYN as readable data */
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop. This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
1215 /*1216 * LISTEN is a special case for select..1217 */1218 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */1219 {1220 if (sel_type == SEL_IN) {1221 intretval;
1222
1223 sk->inuse = 1;
1224 retval = (tcp_find_established(sk) != NULL);
1225 release_sock(sk);
1226 if (!retval)
1227 select_wait(&master_select_wakeup,wait);
1228 returnretval;
1229 }1230 return 0;
1231 }1232
1233
/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to set "sk->inuse", as the upper select layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 *
 *	Returns 1 when the requested condition (readable / writable /
 *	exceptional) holds now, otherwise registers on the socket's wait
 *	queue and returns 0.
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type) {
	case SEL_IN:
		/* A pending error makes the socket "readable" (the read
		 * will report the error). */
		if (sk->err)
			return 1;
		/* Connection not yet established: nothing to read. */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;

		/* Receive side shut down: a read returns EOF at once. */
		if (sk->shutdown & RCV_SHUTDOWN)
			return 1;

		/* Nothing acked beyond what the user has already copied. */
		if (sk->acked_seq == sk->copied_seq)
			break;

		/*
		 * Readable unless the ONLY byte available is an urgent
		 * byte that will be delivered out-of-band (not inline).
		 */
		if (sk->urg_seq != sk->copied_seq ||
		    sk->acked_seq != sk->copied_seq+1 ||
		    sk->urginline || !sk->urg_data)
			return 1;
		break;

	case SEL_OUT:
		if (sk->err)
			return 1;
		if (sk->shutdown & SEND_SHUTDOWN)
			return 0;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;
		/*
		 * This is now right thanks to a small fix
		 * by Matt Dillon.
		 */

		/* Writable only if a worst-case segment's worth of buffer
		 * space is free. */
		if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
			break;
		return 1;

	case SEL_EX:
		/* Exceptional condition: urgent data is pending. */
		if (sk->urg_data)
			return 1;
		break;
	}
	select_wait(sk->sleep, wait);
	return 0;
}
1290 inttcp_ioctl(structsock *sk, intcmd, unsignedlongarg)
/* */1291 {1292 interr;
1293 switch(cmd)
1294 {1295
1296 caseTIOCINQ:
1297 #ifdef FIXME /* FIXME: */1298 caseFIONREAD:
1299 #endif1300 {1301 unsignedlongamount;
1302
1303 if (sk->state == TCP_LISTEN)
1304 return(-EINVAL);
1305
1306 sk->inuse = 1;
1307 amount = tcp_readable(sk);
1308 release_sock(sk);
1309 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1310 if(err)
1311 returnerr;
1312 put_user(amount, (int *)arg);
1313 return(0);
1314 }1315 caseSIOCATMARK:
1316 {1317 intansw = sk->urg_data && sk->urg_seq == sk->copied_seq;
1318
1319 err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1320 if (err)
1321 returnerr;
1322 put_user(answ,(int *) arg);
1323 return(0);
1324 }1325 caseTIOCOUTQ:
1326 {1327 unsignedlongamount;
1328
1329 if (sk->state == TCP_LISTEN) return(-EINVAL);
1330 amount = sock_wspace(sk);
1331 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1332 if(err)
1333 returnerr;
1334 put_user(amount, (int *)arg);
1335 return(0);
1336 }1337 default:
1338 return(-EINVAL);
1339 }1340 }1341
1342
/*
 *	This routine computes a TCP checksum.
 *
 *	Modified January 1995 from a go-faster DOS routine by
 *	Jorge Cwik <jorge@laser.satlink.net>
 *
 *	Folds the TCP pseudo-header (saddr, daddr, length, IPPROTO_TCP)
 *	into 'base', the partial checksum already accumulated over the
 *	TCP segment itself.
 *	NOTE(review): the 'th' parameter is unused here; presumably kept
 *	for the historical call signature — confirm before removing.
 */
unsigned short tcp_check(struct tcphdr *th, int len,
	  unsigned long saddr, unsigned long daddr, unsigned long base)
{
	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
}
1356
1357
1358 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1359 unsignedlongdaddr, intlen, structsock *sk)
1360 {1361 th->check = 0;
1362 th->check = tcp_check(th, len, saddr, daddr,
1363 csum_partial((char *)th,len,0));
1364 return;
1365 }1366
/*
 *	This is the main buffer sending routine. We queue the buffer
 *	having checked it is sane seeming.
 *
 *	The frame either goes onto the write queue (window full,
 *	retransmitting, or congestion window exhausted) or is checksummed
 *	and transmitted immediately.
 */

static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 *	length of packet (not counting length of pre-tcp headers)
	 */

	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 *	Sanity check it..
	 */

	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 *	If we have queued a header size packet.. (these crash a few
	 *	tcp stacks if ack is not set)
	 */

	if (size == sizeof(struct tcphdr))
	{
		/* If it's got a syn or fin it's notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 *	Actual processing.
	 */

	tcp_statistics.TcpOutSegs++;
	skb->seq = ntohl(th->seq);
	/* end_seq = first sequence number NOT covered by this frame. */
	skb->end_seq = skb->seq + size - 4*th->doff;

	/*
	 *	We must queue if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */

	if (after(skb->end_seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit.  So
		 * we shouldn't need to set it at all.  I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 *	If we don't fit we have to start the zero window
		 *	probes. This is broken - we really need to do a partial
		 *	send _first_ (This is what causes the Cisco and PC/TCP
		 *	grief).
		 */

		if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 *	This is going straight out
		 */

		th->ack_seq = htonl(sk->acked_seq);
		th->window = htons(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 *	This is mad. The tcp retransmit queue is put together
		 *	by the ip layer. This causes half the problems with
		 *	unroutable FIN's and other things.
		 */

		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 *	Set for next retransmit based on expected ACK time.
		 *	FIXME: We set this every time which means our
		 *	retransmits are really about a window behind.
		 */

		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
1483 /*1484 * Locking problems lead us to a messy situation where we can have1485 * multiple partially complete buffers queued up. This is really bad1486 * as we don't want to be sending partial buffers. Fix this with1487 * a semaphore or similar to lock tcp_write per socket.1488 *1489 * These routines are pretty self descriptive.1490 */1491
1492 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1493 {1494 structsk_buff * skb;
1495 unsignedlongflags;
1496
1497 save_flags(flags);
1498 cli();
1499 skb = sk->partial;
1500 if (skb) {1501 sk->partial = NULL;
1502 del_timer(&sk->partial_timer);
1503 }1504 restore_flags(flags);
1505 returnskb;
1506 }1507
1508 /*1509 * Empty the partial queue1510 */1511
1512 staticvoidtcp_send_partial(structsock *sk)
/* */1513 {1514 structsk_buff *skb;
1515
1516 if (sk == NULL)
1517 return;
1518 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1519 tcp_send_skb(sk, skb);
1520 }1521
/*
 *	Queue a partial frame
 *
 *	Installs 'skb' as the socket's pending partial buffer and arms a
 *	one second flush timer. If another partial frame was already
 *	pending it is displaced and transmitted (after the new timer is
 *	in place, and outside the interrupts-off region).
 */

void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	tmp = sk->partial;
	if (tmp)
		del_timer(&sk->partial_timer);	/* old frame's timer must not fire */
	sk->partial = skb;
	init_timer(&sk->partial_timer);
	/*
	 *	Wait up to 1 second for the buffer to fill.
	 */
	sk->partial_timer.expires = jiffies+HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	if (tmp)
		tcp_send_skb(sk, tmp);	/* send the displaced frame now */
}
1550
1551 /*1552 * This routine sends an ack and also updates the window. 1553 */1554
1555 staticvoidtcp_send_ack(u32sequence, u32ack,
/* */1556 structsock *sk,
1557 structtcphdr *th, unsignedlongdaddr)
1558 {1559 structsk_buff *buff;
1560 structtcphdr *t1;
1561 structdevice *dev = NULL;
1562 inttmp;
1563
1564 if(sk->zapped)
1565 return; /* We have been reset, we may not send again */1566
1567 /*1568 * We need to grab some memory, and put together an ack,1569 * and then put it into the queue to be sent.1570 */1571
1572 buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1573 if (buff == NULL)
1574 {1575 /* 1576 * Force it to send an ack. We don't have to do this1577 * (ACK is unreliable) but it's much better use of 1578 * bandwidth on slow links to send a spare ack than1579 * resend packets. 1580 */1581
1582 sk->ack_backlog++;
1583 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
1584 {1585 reset_xmit_timer(sk, TIME_WRITE, HZ);
1586 }1587 return;
1588 }1589
1590 /*1591 * Assemble a suitable TCP frame1592 */1593
1594 buff->sk = sk;
1595 buff->localroute = sk->localroute;
1596
1597 /* 1598 * Put in the IP header and routing stuff. 1599 */1600
1601 tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1602 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1603 if (tmp < 0)
1604 {1605 buff->free = 1;
1606 sock_wfree(sk, buff);
1607 return;
1608 }1609 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
1610
1611 memcpy(t1, th, sizeof(*t1));
1612
1613 /*1614 * Swap the send and the receive. 1615 */1616
1617 t1->dest = th->source;
1618 t1->source = th->dest;
1619 t1->seq = ntohl(sequence);
1620 t1->ack = 1;
1621 sk->window = tcp_select_window(sk);
1622 t1->window = ntohs(sk->window);
1623 t1->res1 = 0;
1624 t1->res2 = 0;
1625 t1->rst = 0;
1626 t1->urg = 0;
1627 t1->syn = 0;
1628 t1->psh = 0;
1629 t1->fin = 0;
1630
1631 /*1632 * If we have nothing queued for transmit and the transmit timer1633 * is on we are just doing an ACK timeout and need to switch1634 * to a keepalive.1635 */1636
1637 if (ack == sk->acked_seq)
1638 {1639 sk->ack_backlog = 0;
1640 sk->bytes_rcv = 0;
1641 sk->ack_timed = 0;
1642 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL1643 && sk->ip_xmit_timeout == TIME_WRITE)
1644 {1645 if(sk->keepopen) {1646 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1647 }else{1648 delete_timer(sk);
1649 }1650 }1651 }1652
1653 /*1654 * Fill in the packet and send it1655 */1656
1657 t1->ack_seq = htonl(ack);
1658 t1->doff = sizeof(*t1)/4;
1659 tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1660 if (sk->debug)
1661 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
1662 tcp_statistics.TcpOutSegs++;
1663 sk->prot->queue_xmit(sk, dev, buff, 1);
1664 }1665
1666
1667 /* 1668 * This routine builds a generic TCP header. 1669 */1670
1671 extern__inlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */1672 {1673
1674 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1675 th->seq = htonl(sk->write_seq);
1676 th->psh =(push == 0) ? 1 : 0;
1677 th->doff = sizeof(*th)/4;
1678 th->ack = 1;
1679 th->fin = 0;
1680 sk->ack_backlog = 0;
1681 sk->bytes_rcv = 0;
1682 sk->ack_timed = 0;
1683 th->ack_seq = htonl(sk->acked_seq);
1684 sk->window = tcp_select_window(sk);
1685 th->window = htons(sk->window);
1686
1687 return(sizeof(*th));
1688 }1689
/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	Returns the number of bytes queued/sent, or a negative errno.
 *	May block (unless 'nonblock') waiting for connection establishment
 *	or for send buffer memory; once anything has been copied, partial
 *	progress is returned instead of an error.
 */

static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags)
{
	int copied = 0;
	int copy;
	int tmp;
	int seglen;
	int iovct=0;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	struct proto *prot;
	struct device *dev = NULL;
	unsigned char *from;

	/*
	 *	Do sanity checking for sendmsg/sendto/send
	 */

	if (flags & ~(MSG_OOB|MSG_DONTROUTE))
		return -EINVAL;
	if (msg->msg_name)
	{
		/* An address may be supplied, but it must match the
		 * connected peer exactly. */
		struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
		if(sk->state == TCP_CLOSE)
			return -ENOTCONN;
		if (msg->msg_namelen < sizeof(*addr))
			return -EINVAL;
		if (addr->sin_family && addr->sin_family != AF_INET)
			return -EINVAL;
		if (addr->sin_port != sk->dummy_th.dest)
			return -EISCONN;
		if (addr->sin_addr.s_addr != sk->daddr)
			return -EISCONN;
	}

	/*
	 *	Ok commence sending
	 */

	while(iovct<msg->msg_iovlen)
	{
		seglen=msg->msg_iov[iovct].iov_len;
		from=msg->msg_iov[iovct++].iov_base;
		sk->inuse=1;
		prot = sk->prot;
		while(seglen > 0)
		{
			if (sk->err)
			{			/* Stop on an error */
				release_sock(sk);
				if (copied)
					return(copied);
				return sock_error(sk);
			}

			/*
			 *	First thing we do is make sure that we are established.
			 */

			if (sk->shutdown & SEND_SHUTDOWN)
			{
				release_sock(sk);
				sk->err = EPIPE;
				if (copied)
					return(copied);
				sk->err = 0;
				return(-EPIPE);
			}

			/*
			 *	Wait for a connection to finish.
			 */

			while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
			{
				if (sk->err)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return sock_error(sk);
				}

				if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
				{
					/* Connection is dead in some other state. */
					release_sock(sk);
					if (copied)
						return(copied);

					if (sk->err)
						return sock_error(sk);

					if (sk->keepopen)
					{
						send_sig(SIGPIPE, current, 0);
					}
					return(-EPIPE);
				}

				if (nonblock || copied)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return(-EAGAIN);
				}

				release_sock(sk);
				cli();

				/* Re-test under cli() to close the race between the
				 * state check and going to sleep. */
				if (sk->state != TCP_ESTABLISHED &&
				    sk->state != TCP_CLOSE_WAIT && sk->err == 0)
				{
					interruptible_sleep_on(sk->sleep);
					if (current->signal & ~current->blocked)
					{
						sti();
						if (copied)
							return(copied);
						return(-ERESTARTSYS);
					}
				}
				sk->inuse = 1;
				sti();
			}

		/*
		 * The following code can result in copy <= if sk->mss is ever
		 * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
		 * sk->mtu is constant once SYN processing is finished.  I.e. we
		 * had better not get here until we've seen his SYN and at least one
		 * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
		 * But ESTABLISHED should guarantee that.  sk->max_window is by definition
		 * non-decreasing.  Note that any ioctl to set user_mss must be done
		 * before the exchange of SYN's.  If the initial ack from the other
		 * end has a window of 0, max_window and thus mss will both be 0.
		 */

		/*
		 *	Now we need to check if we have a half built packet.
		 */
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
		/*
		 *	FIXME:  I'm almost sure that this fragment is BUG,
		 *		but it works... I do not know why 8) --ANK
		 *
		 *	Really, we should rebuild all the queues...
		 *	It's difficult. Temprorary hack is to send all
		 *	queued segments with allowed fragmentation.
		 */
		{
			int new_mss = min(sk->mtu, sk->max_window);
			if (new_mss < sk->mss)
			{
				tcp_send_partial(sk);
				sk->mss = new_mss;
			}
		}
#endif

			if ((skb = tcp_dequeue_partial(sk)) != NULL)
			{
				int hdrlen;

				/* IP header + TCP header */
				hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
					+ sizeof(struct tcphdr);

				/* Add more stuff to the end of skb->len */
				if (!(flags & MSG_OOB))
				{
					copy = min(sk->mss - (skb->len - hdrlen), seglen);
					if (copy <= 0)
					{
						printk("TCP: **bug**: \"copy\" <= 0\n");
						return -EFAULT;
					}
					memcpy_fromfs(skb_put(skb,copy), from, copy);
					from += copy;
					copied += copy;
					len -= copy;
					sk->write_seq += copy;
					seglen -= copy;
				}
				/* Send it if it's now full, OOB, or nothing is in
				 * flight; otherwise re-queue the partial frame. */
				if ((skb->len - hdrlen) >= sk->mss ||
					(flags & MSG_OOB) || !sk->packets_out)
					tcp_send_skb(sk, skb);
				else
					tcp_enqueue_partial(skb, sk);
				continue;
			}

		/*
		 * We also need to worry about the window.
		 * If window < 1/2 the maximum window we've seen from this
		 *   host, don't use it.  This is sender side
		 *   silly window prevention, as specified in RFC1122.
		 *   (Note that this is different than earlier versions of
		 *   SWS prevention, e.g. RFC813.).  What we actually do is
		 *   use the whole MSS.  Since the results in the right
		 *   edge of the packet being outside the window, it will
		 *   be queued for later rather than sent.
		 */

			copy = sk->window_seq - sk->write_seq;
			if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
				copy = sk->mss;
			if (copy > seglen)
				copy = seglen;

		/*
		 *	We should really check the window here also.
		 */

			send_tmp = NULL;
			if (copy < sk->mss && !(flags & MSG_OOB))
			{
				/*
				 *	We will release the socket in case we sleep here.
				 */
				release_sock(sk);
				/*
				 *	NB: following must be mtu, because mss can be increased.
				 *	mss is always <= mtu
				 */
				skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
				sk->inuse = 1;
				send_tmp = skb;	/* candidate for the partial queue */
			}
			else
			{
				/*
				 *	We will release the socket in case we sleep here.
				 */
				release_sock(sk);
				skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
				sk->inuse = 1;
			}

			/*
			 *	If we didn't get any memory, we need to sleep.
			 */

			if (skb == NULL)
			{
				sk->socket->flags |= SO_NOSPACE;
				if (nonblock)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return(-EAGAIN);
				}

				/*
				 *	FIXME: here is another race condition.
				 */

				tmp = sk->wmem_alloc;
				release_sock(sk);
				cli();
				/*
				 *	Again we will try to avoid it.
				 */
				if (tmp <= sk->wmem_alloc &&
				  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0)
				{
					sk->socket->flags &= ~SO_NOSPACE;
					interruptible_sleep_on(sk->sleep);
					if (current->signal & ~current->blocked)
					{
						sti();
						if (copied)
							return(copied);
						return(-ERESTARTSYS);
					}
				}
				sk->inuse = 1;
				sti();
				continue;
			}

			skb->sk = sk;
			skb->free = 0;
			skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

			/*
			 * FIXME: we need to optimize this.
			 * Perhaps some hints here would be good.
			 */

			tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
			if (tmp < 0 )
			{
				sock_wfree(sk, skb);
				release_sock(sk);
				if (copied)
					return(copied);
				return(tmp);
			}
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
			skb->ip_hdr->frag_off |= htons(IP_DF);
#endif
			skb->dev = dev;
			skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
			tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
			if (tmp < 0)
			{
				sock_wfree(sk, skb);
				release_sock(sk);
				if (copied)
					return(copied);
				return(tmp);
			}

			if (flags & MSG_OOB)
			{
				skb->h.th->urg = 1;
				skb->h.th->urg_ptr = ntohs(copy);
			}

			memcpy_fromfs(skb_put(skb,copy), from, copy);

			from += copy;
			copied += copy;
			len -= copy;
			seglen -= copy;
			skb->free = 0;
			sk->write_seq += copy;

			/* A short frame with data in flight is held back as a
			 * partial buffer (Nagle). */
			if (send_tmp != NULL && sk->packets_out)
			{
				tcp_enqueue_partial(send_tmp, sk);
				continue;
			}
			tcp_send_skb(sk, skb);
		}
	}
	sk->err = 0;

/*
 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *	interactive fast network servers. It's meant to be on and
 *	it really improves the throughput though not the echo time
 *	on my slow slip link - Alan
 */

/*
 *	Avoid possible race on send_tmp - c/o Johannes Stille
 */

	if(sk->partial && ((!sk->packets_out)
     /* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
/*
 *	Send an ack if one is backlogged at this point. Ought to merge
 *	this with tcp_send_ack().
 *
 *	Builds an ACK-only segment from the socket's template header and
 *	transmits it. On allocation failure a short timer is armed to
 *	retry soon.
 */

static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	if (!sk->ack_backlog)
		return;

	/*
	 * If we're closed, don't send an ack, or we'll get a RST
	 * from the closed destination.
	 */
	if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
		return;

	/*
	 * FIXME: we need to put code here to prevent this routine from
	 * being called.  Being called once in a while is ok, so only check
	 * if this is the second time in a row.
	 */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		buff->free = 1;
		sock_wfree(sk, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	/* Start from the template header, then set the ACK fields. */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	t1->window = htons(sk->window);
	t1->ack_seq = htonl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2137
/*
 *	FIXME:
 *	This routine frees used buffers.
 *	It should consider sending an ACK to let the
 *	other end know we now have a bigger window.
 *
 *	Called after the user has consumed data: releases fully-read
 *	skbs from the receive queue and, if that freed noticeable space,
 *	schedules (or sends) a window-update ACK.
 */

static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	left = sock_rspace(sk);	/* receive space before freeing anything */

	/*
	 * We have to loop through all the buffer headers,
	 * and try to free up all the space we can.
	 */

	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		/* Stop at the first buffer not yet fully consumed or
		 * still referenced by a reader. */
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 * FIXME:
	 * At this point we should send an ack if the difference
	 * in the window, and the amount of space is bigger than
	 * TCP_WINDOW_DIFF.
	 */

	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sock_rspace(sk),
					    left);
	if ((rspace=sock_rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
	/*
	 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
	 * if the other end is offering a window smaller than the agreed on MSS
	 * (called sk->mtu here).  In theory there's no connection between send
	 * and receive, and so no reason to think that they're going to send
	 * small packets.  For the moment I'm using the hack of reducing the mss
	 * only on the send side, so I'm putting mtu here.
	 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);	/* put the timer back unchanged */
		}
	}
}
2227
/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 *
 *	Returns 1 if the single out-of-band byte was copied to the user,
 *	0 at EOF-like conditions, -EINVAL when there is no OOB byte to
 *	read (or it is delivered inline), -EAGAIN when the urgent byte
 *	has been signalled but not yet received.
 */

static int tcp_recv_urg(struct sock * sk, int nonblock,
	     struct msghdr *msg, int len, int flags, int *addr_len)
{
	/*
	 *	No URG data to read
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
		return sock_error(sk);

	if (sk->state == TCP_CLOSE || sk->done)
	{
		if (!sk->done)
		{
			sk->done = 1;	/* first read after close reports EOF */
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID)
	{
		/* The OOB byte is stored in the low bits of urg_data. */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;	/* mark it consumed */
		memcpy_toiovec(msg->msg_iov, &c, 1);
		if(msg->msg_name)
		{
			struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
			sin->sin_family=AF_INET;
			sin->sin_addr.s_addr=sk->daddr;
			sin->sin_port=sk->dummy_th.dest;
		}
		if(addr_len)
			*addr_len=sizeof(struct sockaddr_in);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
2291
/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Walks the receive queue copying in-sequence data into msg's iovec,
 *	handling urgent data, peeking, FIN and the blocking/wakeup dance.
 *	Returns the number of bytes copied, or a negative errno.
 */

static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags, int *addr_len)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */

	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */

	if (flags & MSG_OOB)
		return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 *
	 *	A PEEK advances a private copy of copied_seq so the real
	 *	stream position is untouched.
	 */

	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		u32 offset;

		/*
		 * Are we at urgent data? Stop if we have read anything.
		 */

		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.  Set the task state before the
		 *	queue scan so a wakeup between scan and schedule()
		 *	is not lost.
		 */

		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* Gap before this skb: nothing in-sequence yet. */
			if (before(*seq, skb->seq))
				break;
			offset = *seq - skb->seq;
			/* SYN occupies one sequence number but no data byte. */
			if (skb->h.th->syn)
				offset--;
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			/* Fully consumed buffer: mark it reapable. */
			if (!(flags & MSG_PEEK))
				skb->used = 1;
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* Partial success: return what we have rather than sleep. */
		if (copied)
			break;

		if (sk->err)
		{
			copied = sock_error(sk);
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				/* First read on a closed socket: EOF. */
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Ack what we consumed, drop the lock and wait for data. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */

		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */

		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?
		 */

		if (sk->urg_data)
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					/* Skip over the urgent byte unless it
					   is delivered inline. */
					if (!sk->urginline)
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					/* Stop the copy short of the urgent byte. */
					used = urg_offset;
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */

		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */

		memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */

		skb->users --;

		/* Urgent pointer is behind us: clear the stale marker. */
		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;
		if (used + offset < skb->len)
			continue;

		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		continue;

	found_fin_ok:
		/* FIN consumes one sequence number. */
		++*seq;
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */

		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}

	/* Report the peer's address when data was actually returned. */
	if(copied>0 && msg->msg_name)
	{
		struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
		sin->sin_family=AF_INET;
		sin->sin_addr.s_addr=sk->daddr;
		sin->sin_port=sk->dummy_th.dest;
	}
	if(addr_len)
		*addr_len=sizeof(struct sockaddr_in);

	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2532
2533
/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	closed.
 *
 *	@sk:   socket being closed
 *	@dead: non-zero when this is a full close (no application left),
 *	       enabling the FIN_WAIT2 kill timer below.
 *	Returns 1 if the caller should transmit a FIN, 0 otherwise.
 */

static int tcp_close_state(struct sock *sk, int dead)
{
	int ns=TCP_CLOSE;
	int send_fin=0;
	switch(sk->state)
	{
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:	/* Closedown begin */
			ns=TCP_FIN_WAIT1;
			send_fin=1;
			break;
		case TCP_FIN_WAIT1:	/* Already closing, or FIN sent: no change */
		case TCP_FIN_WAIT2:
		case TCP_CLOSING:
			ns=sk->state;
			break;
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;
		/* States without a case label (e.g. TCP_LAST_ACK,
		   TCP_TIME_WAIT) keep the initial ns of TCP_CLOSE. */
	}

	tcp_set_state(sk,ns);

	/*
	 *	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 *
	 *	Only arm the kill timer if no other timer is pending.
	 */
	if(dead && ns==TCP_FIN_WAIT2)
	{
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			add_timer(&sk->timer);	/* put the running timer back */
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
	}

	return send_fin;
}
/*
 *	Send a fin.
 *
 *	Builds a FIN|ACK segment for sk, advancing write_seq by one for
 *	the FIN.  If there is still unsent data on the write queue the
 *	FIN is queued behind it instead of being transmitted directly.
 */

static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost.
		 *	(Not good).  Sequence space is still consumed so the
		 *	state machine stays consistent.
		 */

		buff->free = 1;
		sock_wfree(sk,buff);
		sk->write_seq++;
		/* Only arm the close timer if no timer is already running. */
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}

	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	buff->seq = sk->write_seq;
	sk->write_seq++;	/* the FIN occupies one sequence number */
	buff->end_seq = sk->write_seq;
	t1->seq = htonl(buff->seq);
	t1->ack = 1;
	t1->ack_seq = htonl(sk->acked_seq);
	t1->window = htons(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 * If there is data in the write queue, the fin must be appended to
	 * the write queue.
	 */

	if (skb_peek(&sk->write_queue) != NULL)
	{
		buff->free = 0;
		if (buff->next != NULL)
		{
			/* Should not happen: unlink before requeueing. */
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	}
	else
	{
		/* Queue empty: transmit now and start the retransmit timer. */
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2692 /*2693 * Shutdown the sending side of a connection. Much like close except2694 * that we don't receive shut down or set sk->dead=1.2695 */2696
2697 voidtcp_shutdown(structsock *sk, inthow)
/* */2698 {2699 /*2700 * We need to grab some memory, and put together a FIN,2701 * and then put it into the queue to be sent.2702 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2703 */2704
2705 if (!(how & SEND_SHUTDOWN))
2706 return;
2707
2708 /*2709 * If we've already sent a FIN, or it's a closed state2710 */2711
2712 if (sk->state == TCP_FIN_WAIT1 ||
2713 sk->state == TCP_FIN_WAIT2 ||
2714 sk->state == TCP_CLOSING ||
2715 sk->state == TCP_LAST_ACK ||
2716 sk->state == TCP_TIME_WAIT ||
2717 sk->state == TCP_CLOSE ||
2718 sk->state == TCP_LISTEN2719 )
2720 {2721 return;
2722 }2723 sk->inuse = 1;
2724
2725 /*2726 * flag that the sender has shutdown2727 */2728
2729 sk->shutdown |= SEND_SHUTDOWN;
2730
2731 /*2732 * Clear out any half completed packets. 2733 */2734
2735 if (sk->partial)
2736 tcp_send_partial(sk);
2737
2738 /*2739 * FIN if needed2740 */2741
2742 if(tcp_close_state(sk,0))
2743 tcp_send_fin(sk);
2744
2745 release_sock(sk);
2746 }2747
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Builds and transmits a stand-alone RST in response to segment th
 *	(received saddr->daddr), with sequence/ack fields chosen per
 *	RFC 793 depending on whether th carried an ACK.
 */

static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	int tmp;
	struct device *ndev=NULL;

	/*
	 *	Cannot reset a reset (Think about it).
	 */

	if(th->rst)
		return;

	/*
	 * We need to grab some memory, and put together an RST,
	 * and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
		return;		/* no memory: silently drop, peer will retry */

	buff->sk = NULL;	/* not owned by any socket */
	buff->dev = dev;
	buff->localroute = 0;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr),tos,ttl,NULL);
	if (tmp < 0)
	{
		buff->free = 1;
		sock_wfree(NULL, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->rst = 1;
	t1->window = 0;

	if(th->ack)
	{
		/* RFC 793: seq taken from the offending segment's ack. */
		t1->ack = 0;
		t1->seq = th->ack_seq;
		t1->ack_seq = 0;
	}
	else
	{
		/* No ACK in the segment: ack everything it occupied
		   (SYN consumes one sequence number). */
		t1->ack = 1;
		if(!th->syn)
			t1->ack_seq = th->seq;
		else
			t1->ack_seq = htonl(ntohl(th->seq)+1);
		t1->seq = 0;
	}

	t1->syn = 0;
	t1->urg = 0;
	t1->fin = 0;
	t1->psh = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
	prot->queue_xmit(NULL, ndev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2831
/*
 *	Look for tcp options. Parses everything but only knows about MSS.
 *	This routine is always called with the packet containing the SYN.
 *	However it may also be called with the ack to the SYN. So you
 *	can't assume this is always the SYN. It's always called after
 *	we have set up sk->mtu to our own MTU.
 *
 *	We need at minimum to add PAWS support here. Possibly large windows
 *	as Linux gets deployed on 100Mb/sec networks.
 */

static void tcp_options(struct sock *sk, struct tcphdr *th)
{
	unsigned char *ptr;
	/* Option bytes = header length beyond the fixed 20-byte header. */
	int length=(th->doff*4)-sizeof(struct tcphdr);
	int mss_seen = 0;

	ptr = (unsigned char *)(th + 1);

	/* NOTE(review): if length==1 and the last byte is neither EOL nor
	   NOP, the opsize read below touches one byte past the options —
	   confirm callers always provide padded headers. */
	while(length>0)
	{
		int opcode=*ptr++;
		int opsize=*ptr++;
		switch(opcode)
		{
			case TCPOPT_EOL:
				return;
			case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
				length--;
				ptr--;		/* the opsize=*ptr++ above was a mistake */
				continue;

			default:
				if(opsize<=2)	/* Avoid silly options looping forever */
					return;
				switch(opcode)
				{
					case TCPOPT_MSS:
						/* Only honour MSS on a SYN; clamp our
						   idea of the path MSS downwards. */
						if(opsize==4 && th->syn)
						{
							sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
							mss_seen = 1;
						}
						break;
						/* Add other options here as people feel the urge to implement stuff like large windows */
				}
				ptr+=opsize-2;
				length-=opsize;
		}
	}
	if (th->syn)
	{
		if (! mss_seen)
			sk->mtu=min(sk->mtu, 536);	/* default MSS if none sent */
	}
#ifdef CONFIG_INET_PCTCP
	/* Halve for hosts without silly-window-syndrome handling. */
	sk->mss = min(sk->max_window >> 1, sk->mtu);
#else
	sk->mss = min(sk->max_window, sk->mtu);
#endif
}
/*
 *	Return the classful (A/B/C) netmask for a destination address.
 *	Input and output are both in network byte order.
 */

static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);

	if (IN_CLASSA(host))
		return htonl(IN_CLASSA_NET);

	return IN_CLASSB(host) ? htonl(IN_CLASSB_NET) : htonl(IN_CLASSC_NET);
}
2904 /*2905 * Default sequence number picking algorithm.2906 * As close as possible to RFC 793, which2907 * suggests using a 250kHz clock.2908 * Further reading shows this assumes 2MB/s networks.2909 * For 10MB/s ethernet, a 1MHz clock is appropriate.2910 * That's funny, Linux has one built in! Use it!2911 */2912
2913 externinlineu32tcp_init_seq(void)
/* */2914 {2915 structtimevaltv;
2916 do_gettimeofday(&tv);
2917 returntv.tv_usec+tv.tv_sec*1000000;
2918 }2919
2920 /*2921 * This routine handles a connection request.2922 * It should make sure we haven't already responded.2923 * Because of the way BSD works, we have to send a syn/ack now.2924 * This also means it will be harder to close a socket which is2925 * listening.2926 */2927
2928 staticvoidtcp_conn_request(structsock *sk, structsk_buff *skb,
/* */2929 unsignedlongdaddr, unsignedlongsaddr,
2930 structoptions *opt, structdevice *dev, u32seq)
2931 {2932 structsk_buff *buff;
2933 structtcphdr *t1;
2934 unsignedchar *ptr;
2935 structsock *newsk;
2936 structtcphdr *th;
2937 structdevice *ndev=NULL;
2938 inttmp;
2939 structrtable *rt;
2940
2941 th = skb->h.th;
2942
2943 /* If the socket is dead, don't accept the connection. */2944 if (!sk->dead)
2945 {2946 sk->data_ready(sk,0);
2947 }2948 else2949 {2950 if(sk->debug)
2951 printk("Reset on %p: Connect on dead socket.\n",sk);
2952 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2953 tcp_statistics.TcpAttemptFails++;
2954 kfree_skb(skb, FREE_READ);
2955 return;
2956 }2957
2958 /*2959 * Make sure we can accept more. This will prevent a2960 * flurry of syns from eating up all our memory.2961 */2962
2963 if (sk->ack_backlog >= sk->max_ack_backlog)
2964 {2965 tcp_statistics.TcpAttemptFails++;
2966 kfree_skb(skb, FREE_READ);
2967 return;
2968 }2969
2970 /*2971 * We need to build a new sock struct.2972 * It is sort of bad to have a socket without an inode attached2973 * to it, but the wake_up's will just wake up the listening socket,2974 * and if the listening socket is destroyed before this is taken2975 * off of the queue, this will take care of it.2976 */2977
2978 newsk = (structsock *) kmalloc(sizeof(structsock), GFP_ATOMIC);
2979 if (newsk == NULL)
2980 {2981 /* just ignore the syn. It will get retransmitted. */2982 tcp_statistics.TcpAttemptFails++;
2983 kfree_skb(skb, FREE_READ);
2984 return;
2985 }2986
2987 memcpy(newsk, sk, sizeof(*newsk));
2988 newsk->opt = NULL;
2989 newsk->ip_route_cache = NULL;
2990 if (opt && opt->optlen) {2991 sk->opt = (structoptions*)kmalloc(sizeof(structoptions)+opt->optlen, GFP_ATOMIC);
2992 if (!sk->opt) {2993 kfree_s(newsk, sizeof(structsock));
2994 tcp_statistics.TcpAttemptFails++;
2995 kfree_skb(skb, FREE_READ);
2996 return;
2997 }2998 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {2999 kfree_s(sk->opt, sizeof(structoptions)+opt->optlen);
3000 kfree_s(newsk, sizeof(structsock));
3001 tcp_statistics.TcpAttemptFails++;
3002 kfree_skb(skb, FREE_READ);
3003 return;
3004 }3005 }3006 skb_queue_head_init(&newsk->write_queue);
3007 skb_queue_head_init(&newsk->receive_queue);
3008 newsk->send_head = NULL;
3009 newsk->send_tail = NULL;
3010 skb_queue_head_init(&newsk->back_log);
3011 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/3012 newsk->rto = TCP_TIMEOUT_INIT;
3013 newsk->mdev = 0;
3014 newsk->max_window = 0;
3015 newsk->cong_window = 1;
3016 newsk->cong_count = 0;
3017 newsk->ssthresh = 0;
3018 newsk->backoff = 0;
3019 newsk->blog = 0;
3020 newsk->intr = 0;
3021 newsk->proc = 0;
3022 newsk->done = 0;
3023 newsk->partial = NULL;
3024 newsk->pair = NULL;
3025 newsk->wmem_alloc = 0;
3026 newsk->rmem_alloc = 0;
3027 newsk->localroute = sk->localroute;
3028
3029 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
3030
3031 newsk->err = 0;
3032 newsk->shutdown = 0;
3033 newsk->ack_backlog = 0;
3034 newsk->acked_seq = skb->seq+1;
3035 newsk->copied_seq = skb->seq+1;
3036 newsk->fin_seq = skb->seq;
3037 newsk->state = TCP_SYN_RECV;
3038 newsk->timeout = 0;
3039 newsk->ip_xmit_timeout = 0;
3040 newsk->write_seq = seq;
3041 newsk->window_seq = newsk->write_seq;
3042 newsk->rcv_ack_seq = newsk->write_seq;
3043 newsk->urg_data = 0;
3044 newsk->retransmits = 0;
3045 newsk->linger=0;
3046 newsk->destroy = 0;
3047 init_timer(&newsk->timer);
3048 newsk->timer.data = (unsignedlong)newsk;
3049 newsk->timer.function = &net_timer;
3050 init_timer(&newsk->retransmit_timer);
3051 newsk->retransmit_timer.data = (unsignedlong)newsk;
3052 newsk->retransmit_timer.function=&retransmit_timer;
3053 newsk->dummy_th.source = skb->h.th->dest;
3054 newsk->dummy_th.dest = skb->h.th->source;
3055
3056 /*3057 * Swap these two, they are from our point of view. 3058 */3059
3060 newsk->daddr = saddr;
3061 newsk->saddr = daddr;
3062 newsk->rcv_saddr = daddr;
3063
3064 put_sock(newsk->num,newsk);
3065 newsk->dummy_th.res1 = 0;
3066 newsk->dummy_th.doff = 6;
3067 newsk->dummy_th.fin = 0;
3068 newsk->dummy_th.syn = 0;
3069 newsk->dummy_th.rst = 0;
3070 newsk->dummy_th.psh = 0;
3071 newsk->dummy_th.ack = 0;
3072 newsk->dummy_th.urg = 0;
3073 newsk->dummy_th.res2 = 0;
3074 newsk->acked_seq = skb->seq + 1;
3075 newsk->copied_seq = skb->seq + 1;
3076 newsk->socket = NULL;
3077
3078 /*3079 * Grab the ttl and tos values and use them 3080 */3081
3082 newsk->ip_ttl=sk->ip_ttl;
3083 newsk->ip_tos=skb->ip_hdr->tos;
3084
3085 /*3086 * Use 512 or whatever user asked for 3087 */3088
3089 /*3090 * Note use of sk->user_mss, since user has no direct access to newsk 3091 */3092
3093 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
3094 newsk->ip_route_cache = rt;
3095
3096 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3097 newsk->window_clamp = rt->rt_window;
3098 else3099 newsk->window_clamp = 0;
3100
3101 if (sk->user_mss)
3102 newsk->mtu = sk->user_mss;
3103 elseif (rt)
3104 newsk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
3105 else3106 newsk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
3107
3108 /*3109 * But not bigger than device MTU 3110 */3111
3112 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
3113
3114 #ifdefCONFIG_SKIP3115
3116 /*3117 * SKIP devices set their MTU to 65535. This is so they can take packets3118 * unfragmented to security process then fragment. They could lie to the3119 * TCP layer about a suitable MTU, but its easier to let skip sort it out3120 * simply because the final package we want unfragmented is going to be3121 *3122 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]3123 */3124
3125 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */3126 sk->mtu=skip_pick_mtu(sk->mtu,dev);
3127 #endif3128 /*3129 * This will min with what arrived in the packet 3130 */3131
3132 tcp_options(newsk,skb->h.th);
3133
3134 tcp_cache_zap();
3135
3136 buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3137 if (buff == NULL)
3138 {3139 sk->err = ENOMEM;
3140 newsk->dead = 1;
3141 newsk->state = TCP_CLOSE;
3142 /* And this will destroy it */3143 release_sock(newsk);
3144 kfree_skb(skb, FREE_READ);
3145 tcp_statistics.TcpAttemptFails++;
3146 return;
3147 }3148
3149 buff->sk = newsk;
3150 buff->localroute = newsk->localroute;
3151
3152 /*3153 * Put in the IP header and routing stuff. 3154 */3155
3156 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3157 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&newsk->ip_route_cache);
3158
3159 /*3160 * Something went wrong. 3161 */3162
3163 if (tmp < 0)
3164 {3165 sk->err = tmp;
3166 buff->free = 1;
3167 kfree_skb(buff,FREE_WRITE);
3168 newsk->dead = 1;
3169 newsk->state = TCP_CLOSE;
3170 release_sock(newsk);
3171 skb->sk = sk;
3172 kfree_skb(skb, FREE_READ);
3173 tcp_statistics.TcpAttemptFails++;
3174 return;
3175 }3176
3177 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
3178
3179 memcpy(t1, skb->h.th, sizeof(*t1));
3180 buff->seq = newsk->write_seq++;
3181 buff->end_seq = newsk->write_seq;
3182 /*3183 * Swap the send and the receive. 3184 */3185 t1->dest = skb->h.th->source;
3186 t1->source = newsk->dummy_th.source;
3187 t1->seq = ntohl(buff->seq);
3188 t1->ack = 1;
3189 newsk->window = tcp_select_window(newsk);
3190 newsk->sent_seq = newsk->write_seq;
3191 t1->window = ntohs(newsk->window);
3192 t1->res1 = 0;
3193 t1->res2 = 0;
3194 t1->rst = 0;
3195 t1->urg = 0;
3196 t1->psh = 0;
3197 t1->syn = 1;
3198 t1->ack_seq = htonl(newsk->acked_seq);
3199 t1->doff = sizeof(*t1)/4+1;
3200 ptr = skb_put(buff,4);
3201 ptr[0] = 2;
3202 ptr[1] = 4;
3203 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3204 ptr[3] =(newsk->mtu) & 0xff;
3205
3206 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3207 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3208 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3209 skb->sk = newsk;
3210
3211 /*3212 * Charge the sock_buff to newsk. 3213 */3214
3215 sk->rmem_alloc -= skb->truesize;
3216 newsk->rmem_alloc += skb->truesize;
3217
3218 skb_queue_tail(&sk->receive_queue,skb);
3219 sk->ack_backlog++;
3220 release_sock(newsk);
3221 tcp_statistics.TcpOutSegs++;
3222 }3223
3224
/*
 *	Close a TCP socket.  timeout != 0 means an abortive/timed-out
 *	close (go straight to CLOSE); timeout == 0 is a normal descriptor
 *	close where we flush the receive queue and FIN if appropriate.
 */

static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 * We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 */

	sk->inuse = 1;

	/* Drop the header-prediction cache if it points at us. */
	if(th_cache_sk==sk)
		tcp_cache_zap();
	if(sk->state == TCP_LISTEN)
	{
		/* Special case */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);	/* discard queued embryonic connections */
		release_sock(sk);
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;	/* both directions are now shut */

	if (!sk->dead)
		sk->state_change(sk);

	if (timeout == 0)
	{
		struct sk_buff *skb;

		/*
		 *  We need to flush the recv. buffs.  We do this only on the
		 *  descriptor close, not protocol-sourced closes, because the
		 *  reader process may not have drained the data yet!
		 */

		while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 *	Get rid off any half-completed packets.
		 */

		if (sk->partial)
			tcp_send_partial(sk);
	}


	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */

	if(timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* Shift state; send a FIN if the new state requires one. */
		if(tcp_close_state(sk,1)==1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
3291
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue. This happens as incoming acks
 *	open up the remote window for us.
 */

static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->end_seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->end_seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->end_seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it. This
			 *	cannot currently occur.
			 */

			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
			/*
			 * put in the ack seq and window at this point rather than earlier,
			 * in order to keep them monotonic. We really want to avoid taking
			 * back window allocations. That's legal, but RFC1122 says it's frowned on.
			 * Ack and window will in general have changed since this packet was put
			 * on the write queue.
			 */
			iph = skb->ip_hdr;
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			size = skb->len - (((unsigned char *) th) - skb->data);
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
			/* Oversized for the current path MTU estimate: allow
			   fragmentation and refresh the IP checksum. */
			if (size > sk->mtu - sizeof(struct iphdr))
			{
				iph->frag_off &= ~htons(IP_DF);
				ip_send_check(iph);
			}
#endif

			th->ack_seq = htonl(sk->acked_seq);
			th->window = htons(tcp_select_window(sk));

			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->end_seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */

			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */

			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3389
3390 /*3391 * This routine deals with incoming acks, but not outgoing ones.3392 */3393
3394 extern__inline__inttcp_ack(structsock *sk, structtcphdr *th, unsignedlongsaddr, intlen)
/* */3395 {3396 u32ack;
3397 intflag = 0;
3398
3399 /* 3400 * 1 - there was data in packet as well as ack or new data is sent or 3401 * in shutdown state3402 * 2 - data from retransmit queue was acked and removed3403 * 4 - window shrunk or data from retransmit queue was acked and removed3404 */3405
3406 if(sk->zapped)
3407 return(1); /* Dead, cant ack any more so why bother */3408
3409 /*3410 * Have we discovered a larger window3411 */3412
3413 ack = ntohl(th->ack_seq);
3414
3415 if (ntohs(th->window) > sk->max_window)
3416 {3417 sk->max_window = ntohs(th->window);
3418 #ifdefCONFIG_INET_PCTCP3419 /* Hack because we don't send partial packets to non SWS3420 handling hosts */3421 sk->mss = min(sk->max_window>>1, sk->mtu);
3422 #else3423 sk->mss = min(sk->max_window, sk->mtu);
3424 #endif3425 }3426
3427 /*3428 * We have dropped back to keepalive timeouts. Thus we have3429 * no retransmits pending.3430 */3431
3432 if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3433 sk->retransmits = 0;
3434
3435 /*3436 * If the ack is newer than sent or older than previous acks3437 * then we can probably ignore it.3438 */3439
3440 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
3441 {3442 if(sk->debug)
3443 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3444
3445 /*3446 * Keepalive processing.3447 */3448
3449 if (after(ack, sk->sent_seq))
3450 {3451 return(0);
3452 }3453
3454 /*3455 * Restart the keepalive timer.3456 */3457
3458 if (sk->keepopen)
3459 {3460 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3461 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3462 }3463 return(1);
3464 }3465
3466 /*3467 * If there is data set flag 13468 */3469
3470 if (len != th->doff*4)
3471 flag |= 1;
3472
3473 /*3474 * See if our window has been shrunk. 3475 */3476
3477 if (after(sk->window_seq, ack+ntohs(th->window)))
3478 {3479 /*3480 * We may need to move packets from the send queue3481 * to the write queue, if the window has been shrunk on us.3482 * The RFC says you are not allowed to shrink your window3483 * like this, but if the other end does, you must be able3484 * to deal with it.3485 */3486 structsk_buff *skb;
3487 structsk_buff *skb2;
3488 structsk_buff *wskb = NULL;
3489
3490 skb2 = sk->send_head;
3491 sk->send_head = NULL;
3492 sk->send_tail = NULL;
3493
3494 /*3495 * This is an artifact of a flawed concept. We want one3496 * queue and a smarter send routine when we send all.3497 */3498
3499 flag |= 4; /* Window changed */3500
3501 sk->window_seq = ack + ntohs(th->window);
3502 cli();
3503 while (skb2 != NULL)
3504 {3505 skb = skb2;
3506 skb2 = skb->link3;
3507 skb->link3 = NULL;
3508 if (after(skb->end_seq, sk->window_seq))
3509 {3510 if (sk->packets_out > 0)
3511 sk->packets_out--;
3512 /* We may need to remove this from the dev send list. */3513 if (skb->next != NULL)
3514 {3515 skb_unlink(skb);
3516 }3517 /* Now add it to the write_queue. */3518 if (wskb == NULL)
3519 skb_queue_head(&sk->write_queue,skb);
3520 else3521 skb_append(wskb,skb);
3522 wskb = skb;
3523 }3524 else3525 {3526 if (sk->send_head == NULL)
3527 {3528 sk->send_head = skb;
3529 sk->send_tail = skb;
3530 }3531 else3532 {3533 sk->send_tail->link3 = skb;
3534 sk->send_tail = skb;
3535 }3536 skb->link3 = NULL;
3537 }3538 }3539 sti();
3540 }3541
3542 /*3543 * Pipe has emptied3544 */3545
3546 if (sk->send_tail == NULL || sk->send_head == NULL)
3547 {3548 sk->send_head = NULL;
3549 sk->send_tail = NULL;
3550 sk->packets_out= 0;
3551 }3552
3553 /*3554 * Update the right hand window edge of the host3555 */3556
3557 sk->window_seq = ack + ntohs(th->window);
3558
3559 /*3560 * We don't want too many packets out there. 3561 */3562
3563 if (sk->ip_xmit_timeout == TIME_WRITE &&
3564 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
3565 {3566 /* 3567 * This is Jacobson's slow start and congestion avoidance. 3568 * SIGCOMM '88, p. 328. Because we keep cong_window in integral3569 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a 3570 * counter and increment it once every cwnd times. It's possible3571 * that this should be done only if sk->retransmits == 0. I'm3572 * interpreting "new data is acked" as including data that has3573 * been retransmitted but is just now being acked.3574 */3575 if (sk->cong_window < sk->ssthresh)
3576 /* 3577 * In "safe" area, increase3578 */3579 sk->cong_window++;
3580 else3581 {3582 /*3583 * In dangerous area, increase slowly. In theory this is3584 * sk->cong_window += 1 / sk->cong_window3585 */3586 if (sk->cong_count >= sk->cong_window)
3587 {3588 sk->cong_window++;
3589 sk->cong_count = 0;
3590 }3591 else3592 sk->cong_count++;
3593 }3594 }3595
3596 /*3597 * Remember the highest ack received.3598 */3599
3600 sk->rcv_ack_seq = ack;
3601
3602 /*3603 * We passed data and got it acked, remove any soft error3604 * log. Something worked...3605 */3606
3607 sk->err_soft = 0;
3608
3609 /*3610 * If this ack opens up a zero window, clear backoff. It was3611 * being used to time the probes, and is probably far higher than3612 * it needs to be for normal retransmission.3613 */3614
3615 if (sk->ip_xmit_timeout == TIME_PROBE0)
3616 {3617 sk->retransmits = 0; /* Our probe was answered */3618
3619 /*3620 * Was it a usable window open ?3621 */3622
3623 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */3624 ! before (sk->window_seq, sk->write_queue.next->end_seq))
3625 {3626 sk->backoff = 0;
3627
3628 /*3629 * Recompute rto from rtt. this eliminates any backoff.3630 */3631
3632 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3633 if (sk->rto > 120*HZ)
3634 sk->rto = 120*HZ;
3635 if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about3636 .2 of a second because of BSD delayed acks - on a 100Mb/sec link3637 .2 of a second is going to need huge windows (SIGH) */3638 sk->rto = 20;
3639 }3640 }3641
3642 /* 3643 * See if we can take anything off of the retransmit queue.3644 */3645
3646 while(sk->send_head != NULL)
3647 {3648 /* Check for a bug. */3649 if (sk->send_head->link3 &&
3650 after(sk->send_head->end_seq, sk->send_head->link3->end_seq))
3651 printk("INET: tcp.c: *** bug send_list out of order.\n");
3652
3653 /*3654 * If our packet is before the ack sequence we can3655 * discard it as it's confirmed to have arrived the other end.3656 */3657
3658 if (before(sk->send_head->end_seq, ack+1))
3659 {3660 structsk_buff *oskb;
3661 if (sk->retransmits)
3662 {3663 /*3664 * We were retransmitting. don't count this in RTT est 3665 */3666 flag |= 2;
3667
3668 /*3669 * even though we've gotten an ack, we're still3670 * retransmitting as long as we're sending from3671 * the retransmit queue. Keeping retransmits non-zero3672 * prevents us from getting new data interspersed with3673 * retransmissions.3674 */3675
3676 if (sk->send_head->link3) /* Any more queued retransmits? */3677 sk->retransmits = 1;
3678 else3679 sk->retransmits = 0;
3680 }3681 /*3682 * Note that we only reset backoff and rto in the3683 * rtt recomputation code. And that doesn't happen3684 * if there were retransmissions in effect. So the3685 * first new packet after the retransmissions is3686 * sent with the backoff still in effect. Not until3687 * we get an ack from a non-retransmitted packet do3688 * we reset the backoff and rto. This allows us to deal3689 * with a situation where the network delay has increased3690 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)3691 */3692
3693 /*3694 * We have one less packet out there. 3695 */3696
3697 if (sk->packets_out > 0)
3698 sk->packets_out --;
3699 /* 3700 * Wake up the process, it can probably write more. 3701 */3702 if (!sk->dead)
3703 sk->write_space(sk);
3704 oskb = sk->send_head;
3705
3706 if (!(flag&2)) /* Not retransmitting */3707 {3708 longm;
3709
3710 /*3711 * The following amusing code comes from Jacobson's3712 * article in SIGCOMM '88. Note that rtt and mdev3713 * are scaled versions of rtt and mean deviation.3714 * This is designed to be as fast as possible 3715 * m stands for "measurement".3716 */3717
3718 m = jiffies - oskb->when; /* RTT */3719 if(m<=0)
3720 m=1; /* IS THIS RIGHT FOR <0 ??? */3721 m -= (sk->rtt >> 3); /* m is now error in rtt est */3722 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */3723 if (m < 0)
3724 m = -m; /* m is now abs(error) */3725 m -= (sk->mdev >> 2); /* similar update on mdev */3726 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */3727
3728 /*3729 * Now update timeout. Note that this removes any backoff.3730 */3731
3732 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3733 if (sk->rto > 120*HZ)
3734 sk->rto = 120*HZ;
3735 if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */3736 sk->rto = 20;
3737 sk->backoff = 0;
3738 }3739 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt 3740 In this case as we just set it up */3741 cli();
3742 oskb = sk->send_head;
3743 IS_SKB(oskb);
3744 sk->send_head = oskb->link3;
3745 if (sk->send_head == NULL)
3746 {3747 sk->send_tail = NULL;
3748 }3749
3750 /*3751 * We may need to remove this from the dev send list. 3752 */3753
3754 if (oskb->next)
3755 skb_unlink(oskb);
3756 sti();
3757 kfree_skb(oskb, FREE_WRITE); /* write. */3758 if (!sk->dead)
3759 sk->write_space(sk);
3760 }3761 else3762 {3763 break;
3764 }3765 }3766
3767 /*3768 * XXX someone ought to look at this too.. at the moment, if skb_peek()3769 * returns non-NULL, we complete ignore the timer stuff in the else3770 * clause. We ought to organize the code so that else clause can3771 * (should) be executed regardless, possibly moving the PROBE timer3772 * reset over. The skb_peek() thing should only move stuff to the3773 * write queue, NOT also manage the timer functions.3774 */3775
3776 /*3777 * Maybe we can take some stuff off of the write queue,3778 * and put it onto the xmit queue.3779 */3780 if (skb_peek(&sk->write_queue) != NULL)
3781 {3782 if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
3783 (sk->retransmits == 0 ||
3784 sk->ip_xmit_timeout != TIME_WRITE ||
3785 before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
3786 && sk->packets_out < sk->cong_window)
3787 {3788 /*3789 * Add more data to the send queue.3790 */3791 flag |= 1;
3792 tcp_write_xmit(sk);
3793 }3794 elseif (before(sk->window_seq, sk->write_queue.next->end_seq) &&
3795 sk->send_head == NULL &&
3796 sk->ack_backlog == 0 &&
3797 sk->state != TCP_TIME_WAIT)
3798 {3799 /*3800 * Data to queue but no room.3801 */3802 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3803 }3804 }3805 else3806 {3807 /*3808 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets3809 * from TCP_CLOSE we don't do anything3810 *3811 * from anything else, if there is write data (or fin) pending,3812 * we use a TIME_WRITE timeout, else if keepalive we reset to3813 * a KEEPALIVE timeout, else we delete the timer.3814 *3815 * We do not set flag for nominal write data, otherwise we may3816 * force a state where we start to write itsy bitsy tidbits3817 * of data.3818 */3819
3820 switch(sk->state) {3821 caseTCP_TIME_WAIT:
3822 /*3823 * keep us in TIME_WAIT until we stop getting packets,3824 * reset the timeout.3825 */3826 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3827 break;
3828 caseTCP_CLOSE:
3829 /*3830 * don't touch the timer.3831 */3832 break;
3833 default:
3834 /*3835 * Must check send_head, write_queue, and ack_backlog3836 * to determine which timeout to use.3837 */3838 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {3839 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3840 }elseif (sk->keepopen) {3841 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3842 }else{3843 del_timer(&sk->retransmit_timer);
3844 sk->ip_xmit_timeout = 0;
3845 }3846 break;
3847 }3848 }3849
3850 /*3851 * We have nothing queued but space to send. Send any partial3852 * packets immediately (end of Nagle rule application).3853 */3854
3855 if (sk->packets_out == 0 && sk->partial != NULL &&
3856 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
3857 {3858 flag |= 1;
3859 tcp_send_partial(sk);
3860 }3861
3862 /*3863 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and3864 * we are now waiting for an acknowledge to our FIN. The other end is3865 * already in TIME_WAIT.3866 *3867 * Move to TCP_CLOSE on success.3868 */3869
3870 if (sk->state == TCP_LAST_ACK)
3871 {3872 if (!sk->dead)
3873 sk->state_change(sk);
3874 if(sk->debug)
3875 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3876 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3877 if (sk->rcv_ack_seq == sk->write_seq/*&& sk->acked_seq == sk->fin_seq*/)
3878 {3879 flag |= 1;
3880 tcp_set_state(sk,TCP_CLOSE);
3881 sk->shutdown = SHUTDOWN_MASK;
3882 }3883 }3884
3885 /*3886 * Incoming ACK to a FIN we sent in the case of our initiating the close.3887 *3888 * Move to FIN_WAIT2 to await a FIN from the other end. Set3889 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.3890 */3891
3892 if (sk->state == TCP_FIN_WAIT1)
3893 {3894
3895 if (!sk->dead)
3896 sk->state_change(sk);
3897 if (sk->rcv_ack_seq == sk->write_seq)
3898 {3899 flag |= 1;
3900 sk->shutdown |= SEND_SHUTDOWN;
3901 tcp_set_state(sk, TCP_FIN_WAIT2);
3902 }3903 }3904
3905 /*3906 * Incoming ACK to a FIN we sent in the case of a simultaneous close.3907 *3908 * Move to TIME_WAIT3909 */3910
3911 if (sk->state == TCP_CLOSING)
3912 {3913
3914 if (!sk->dead)
3915 sk->state_change(sk);
3916 if (sk->rcv_ack_seq == sk->write_seq)
3917 {3918 flag |= 1;
3919 tcp_time_wait(sk);
3920 }3921 }3922
3923 /*3924 * Final ack of a three way shake 3925 */3926
3927 if(sk->state==TCP_SYN_RECV)
3928 {3929 tcp_set_state(sk, TCP_ESTABLISHED);
3930 tcp_options(sk,th);
3931 sk->dummy_th.dest=th->source;
3932 sk->copied_seq = sk->acked_seq;
3933 if(!sk->dead)
3934 sk->state_change(sk);
3935 if(sk->max_window==0)
3936 {3937 sk->max_window=32; /* Sanity check */3938 sk->mss=min(sk->max_window,sk->mtu);
3939 }3940 }3941
3942 /*3943 * I make no guarantees about the first clause in the following3944 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under3945 * what conditions "!flag" would be true. However I think the rest3946 * of the conditions would prevent that from causing any3947 * unnecessary retransmission. 3948 * Clearly if the first packet has expired it should be 3949 * retransmitted. The other alternative, "flag&2 && retransmits", is3950 * harder to explain: You have to look carefully at how and when the3951 * timer is set and with what timeout. The most recent transmission always3952 * sets the timer. So in general if the most recent thing has timed3953 * out, everything before it has as well. So we want to go ahead and3954 * retransmit some more. If we didn't explicitly test for this3955 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"3956 * would not be true. If you look at the pattern of timing, you can3957 * show that rto is increased fast enough that the next packet would3958 * almost never be retransmitted immediately. Then you'd end up3959 * waiting for a timeout to send each packet on the retransmission3960 * queue. With my implementation of the Karn sampling algorithm,3961 * the timeout would double each time. The net result is that it would3962 * take a hideous amount of time to recover from a single dropped packet.3963 * It's possible that there should also be a test for TIME_WRITE, but3964 * I think as long as "send_head != NULL" and "retransmit" is on, we've3965 * got to be in real retransmission mode.3966 * Note that tcp_do_retransmit is called with all==1. Setting cong_window3967 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.3968 * As long as no further losses occur, this seems reasonable.3969 */3970
3971 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3972 (((flag&2) && sk->retransmits) ||
3973 (sk->send_head->when + sk->rto < jiffies)))
3974 {3975 if(sk->send_head->when + sk->rto < jiffies)
3976 tcp_retransmit(sk,0);
3977 else3978 {3979 tcp_do_retransmit(sk, 1);
3980 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3981 }3982 }3983
3984 return(1);
3985 }3986
3987
/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 *
 *	Returns 0 in all cases; callers ignore the value.
 */
static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	/* Remember where the peer's stream ends; read-side code compares
	   against this. */
	sk->fin_seq = skb->end_seq;

	if (!sk->dead)
	{
		/* Wake anyone blocked on the socket and post SIGIO-style
		   async notification. */
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * move to CLOSE_WAIT, tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			/* A FIN carried on an RST segment shuts both
			   directions at once. */
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * received a retransmission of the FIN,
			 * restart the TIME_WAIT timer.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens, we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */
			if(sk->ip_xmit_timeout != TIME_WRITE)
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * received a FIN -- send ACK and enter TIME_WAIT
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * already in CLOSE
			 */
			break;
		default:
			/* Any other state (e.g. LISTEN): go to LAST_ACK.
			   NOTE(review): reaching here from LISTEN looks odd —
			   presumably unreachable in practice; confirm against
			   callers. */
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}
4083
4084
4085 /*4086 * This routine handles the data. If there is room in the buffer,4087 * it will be have already been moved into it. If there is no4088 * room, then we will just have to discard the packet.4089 */4090
4091 extern__inline__inttcp_data(structsk_buff *skb, structsock *sk,
/* */4092 unsignedlongsaddr, unsignedshortlen)
4093 {4094 structsk_buff *skb1, *skb2;
4095 structtcphdr *th;
4096 intdup_dumped=0;
4097 u32new_seq, shut_seq;
4098
4099 th = skb->h.th;
4100 skb_pull(skb,th->doff*4);
4101 skb_trim(skb,len-(th->doff*4));
4102
4103 /*4104 * The bytes in the receive read/assembly queue has increased. Needed for the4105 * low memory discard algorithm 4106 */4107
4108 sk->bytes_rcv += skb->len;
4109
4110 if (skb->len == 0 && !th->fin)
4111 {4112 /* 4113 * Don't want to keep passing ack's back and forth. 4114 * (someone sent us dataless, boring frame)4115 */4116 if (!th->ack)
4117 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4118 kfree_skb(skb, FREE_READ);
4119 return(0);
4120 }4121
4122 /*4123 * We no longer have anyone receiving data on this connection.4124 */4125
4126 #ifndef TCP_DONT_RST_SHUTDOWN
4127
4128 if(sk->shutdown & RCV_SHUTDOWN)
4129 {4130 /*4131 * FIXME: BSD has some magic to avoid sending resets to4132 * broken 4.2 BSD keepalives. Much to my surprise a few non4133 * BSD stacks still have broken keepalives so we want to4134 * cope with it.4135 */4136
4137 if(skb->len) /* We don't care if it's just an ack or4138 a keepalive/window probe */4139 {4140 new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */4141
4142 /* Do this the way 4.4BSD treats it. Not what I'd4143 regard as the meaning of the spec but it's what BSD4144 does and clearly they know everything 8) */4145
4146 /*4147 * This is valid because of two things4148 *4149 * a) The way tcp_data behaves at the bottom.4150 * b) A fin takes effect when read not when received.4151 */4152
4153 shut_seq = sk->acked_seq+1; /* Last byte */4154
4155 if(after(new_seq,shut_seq))
4156 {4157 if(sk->debug)
4158 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4159 sk, new_seq, shut_seq, sk->blog);
4160 if(sk->dead)
4161 {4162 sk->acked_seq = new_seq + th->fin;
4163 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4164 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4165 tcp_statistics.TcpEstabResets++;
4166 tcp_set_state(sk,TCP_CLOSE);
4167 sk->err = EPIPE;
4168 sk->shutdown = SHUTDOWN_MASK;
4169 kfree_skb(skb, FREE_READ);
4170 return 0;
4171 }4172 }4173 }4174 }4175
4176 #endif4177
4178 /*4179 * Now we have to walk the chain, and figure out where this one4180 * goes into it. This is set up so that the last packet we received4181 * will be the first one we look at, that way if everything comes4182 * in order, there will be no performance loss, and if they come4183 * out of order we will be able to fit things in nicely.4184 *4185 * [AC: This is wrong. We should assume in order first and then walk4186 * forwards from the first hole based upon real traffic patterns.]4187 * 4188 */4189
4190 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */4191 {4192 skb_queue_head(&sk->receive_queue,skb);
4193 skb1= NULL;
4194 }4195 else4196 {4197 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
4198 {4199 if(sk->debug)
4200 {4201 printk("skb1=%p :", skb1);
4202 printk("skb1->seq = %d: ", skb1->seq);
4203 printk("skb->seq = %d\n",skb->seq);
4204 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4205 sk->acked_seq);
4206 }4207
4208 /*4209 * Optimisation: Duplicate frame or extension of previous frame from4210 * same sequence point (lost ack case).4211 * The frame contains duplicate data or replaces a previous frame4212 * discard the previous frame (safe as sk->inuse is set) and put4213 * the new one in its place.4214 */4215
4216 if (skb->seq==skb1->seq && skb->len>=skb1->len)
4217 {4218 skb_append(skb1,skb);
4219 skb_unlink(skb1);
4220 kfree_skb(skb1,FREE_READ);
4221 dup_dumped=1;
4222 skb1=NULL;
4223 break;
4224 }4225
4226 /*4227 * Found where it fits4228 */4229
4230 if (after(skb->seq+1, skb1->seq))
4231 {4232 skb_append(skb1,skb);
4233 break;
4234 }4235
4236 /*4237 * See if we've hit the start. If so insert.4238 */4239 if (skb1 == skb_peek(&sk->receive_queue))
4240 {4241 skb_queue_head(&sk->receive_queue, skb);
4242 break;
4243 }4244 }4245 }4246
4247 /*4248 * Figure out what the ack value for this frame is4249 */4250
4251 if (before(sk->acked_seq, sk->copied_seq))
4252 {4253 printk("*** tcp.c:tcp_data bug acked < copied\n");
4254 sk->acked_seq = sk->copied_seq;
4255 }4256
4257 /*4258 * Now figure out if we can ack anything. This is very messy because we really want two4259 * receive queues, a completed and an assembly queue. We also want only one transmit4260 * queue.4261 */4262
4263 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1))
4264 {4265 if (before(skb->seq, sk->acked_seq+1))
4266 {4267 intnewwindow;
4268
4269 if (after(skb->end_seq, sk->acked_seq))
4270 {4271 newwindow = sk->window - (skb->end_seq - sk->acked_seq);
4272 if (newwindow < 0)
4273 newwindow = 0;
4274 sk->window = newwindow;
4275 sk->acked_seq = skb->end_seq;
4276 }4277 skb->acked = 1;
4278
4279 /*4280 * When we ack the fin, we do the FIN 4281 * processing.4282 */4283
4284 if (skb->h.th->fin)
4285 {4286 tcp_fin(skb,sk,skb->h.th);
4287 }4288
4289 for(skb2 = skb->next;
4290 skb2 != (structsk_buff *)&sk->receive_queue;
4291 skb2 = skb2->next)
4292 {4293 if (before(skb2->seq, sk->acked_seq+1))
4294 {4295 if (after(skb2->end_seq, sk->acked_seq))
4296 {4297 newwindow = sk->window -
4298 (skb2->end_seq - sk->acked_seq);
4299 if (newwindow < 0)
4300 newwindow = 0;
4301 sk->window = newwindow;
4302 sk->acked_seq = skb2->end_seq;
4303 }4304 skb2->acked = 1;
4305 /*4306 * When we ack the fin, we do4307 * the fin handling.4308 */4309 if (skb2->h.th->fin)
4310 {4311 tcp_fin(skb,sk,skb->h.th);
4312 }4313
4314 /*4315 * Force an immediate ack.4316 */4317
4318 sk->ack_backlog = sk->max_ack_backlog;
4319 }4320 else4321 {4322 break;
4323 }4324 }4325
4326 /*4327 * This also takes care of updating the window.4328 * This if statement needs to be simplified.4329 */4330 if (!sk->delay_acks ||
4331 sk->ack_backlog >= sk->max_ack_backlog ||
4332 sk->bytes_rcv > sk->max_unacked || th->fin) {4333 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4334 }4335 else4336 {4337 sk->ack_backlog++;
4338 if(sk->debug)
4339 printk("Ack queued.\n");
4340 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4341 }4342 }4343 }4344
4345 /*4346 * If we've missed a packet, send an ack.4347 * Also start a timer to send another.4348 */4349
4350 if (!skb->acked)
4351 {4352
4353 /*4354 * This is important. If we don't have much room left,4355 * we need to throw out a few packets so we have a good4356 * window. Note that mtu is used, not mss, because mss is really4357 * for the send side. He could be sending us stuff as large as mtu.4358 */4359
4360 while (sock_rspace(sk) < sk->mtu)
4361 {4362 skb1 = skb_peek(&sk->receive_queue);
4363 if (skb1 == NULL)
4364 {4365 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4366 break;
4367 }4368
4369 /*4370 * Don't throw out something that has been acked. 4371 */4372
4373 if (skb1->acked)
4374 {4375 break;
4376 }4377
4378 skb_unlink(skb1);
4379 kfree_skb(skb1, FREE_READ);
4380 }4381 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4382 sk->ack_backlog++;
4383 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4384 }4385 else4386 {4387 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4388 }4389
4390 /*4391 * Now tell the user we may have some data. 4392 */4393
4394 if (!sk->dead)
4395 {4396 if(sk->debug)
4397 printk("Data wakeup.\n");
4398 sk->data_ready(sk,0);
4399 }4400 return(0);
4401 }4402
4403
4404 /*4405 * This routine is only called when we have urgent data4406 * signalled. Its the 'slow' part of tcp_urg. It could be4407 * moved inline now as tcp_urg is only called from one4408 * place. We handle URGent data wrong. We have to - as4409 * BSD still doesn't use the correction from RFC961.4410 */4411
4412 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4413 {4414 u32ptr = ntohs(th->urg_ptr);
4415
4416 if (ptr)
4417 ptr--;
4418 ptr += ntohl(th->seq);
4419
4420 /* ignore urgent data that we've already seen and read */4421 if (after(sk->copied_seq, ptr))
4422 return;
4423
4424 /* do we already have a newer (or duplicate) urgent pointer? */4425 if (sk->urg_data && !after(ptr, sk->urg_seq))
4426 return;
4427
4428 /* tell the world about our new urgent pointer */4429 if (sk->proc != 0) {4430 if (sk->proc > 0) {4431 kill_proc(sk->proc, SIGURG, 1);
4432 }else{4433 kill_pg(-sk->proc, SIGURG, 1);
4434 }4435 }4436 sk->urg_data = URG_NOTYET;
4437 sk->urg_seq = ptr;
4438 }4439
4440 /*4441 * This is the 'fast' part of urgent handling.4442 */4443
4444 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4445 unsignedlongsaddr, unsignedlonglen)
4446 {4447 u32ptr;
4448
4449 /*4450 * Check if we get a new urgent pointer - normally not 4451 */4452
4453 if (th->urg)
4454 tcp_check_urg(sk,th);
4455
4456 /*4457 * Do we wait for any urgent data? - normally not4458 */4459
4460 if (sk->urg_data != URG_NOTYET)
4461 return 0;
4462
4463 /*4464 * Is the urgent pointer pointing into this packet? 4465 */4466
4467 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
4468 if (ptr >= len)
4469 return 0;
4470
4471 /*4472 * Ok, got the correct packet, update info 4473 */4474
4475 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4476 if (!sk->dead)
4477 sk->data_ready(sk,0);
4478 return 0;
4479 }4480
/*
 *	This will accept the next outstanding connection.
 *
 *	Blocks (unless O_NONBLOCK) until an established connection is
 *	queued on the listening socket, then returns its struct sock.
 *	On failure returns NULL with sk->err set (EINVAL, EAGAIN or
 *	ERESTARTSYS).
 */
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 *	We need to make sure that this socket is listening,
	 *	and that it has something pending.
	 */

	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race. */
	cli();
	sk->inuse = 1;

	/* Wait for an established connection to appear on the accept
	   queue. We drop the socket lock before sleeping and retake it
	   (sk->inuse = 1) after waking. */
	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		/* Interrupted by a signal: bail out with ERESTARTSYS.
		   NOTE(review): the socket is not re-locked on this path —
		   presumably intentional since we released it before
		   sleeping; confirm. */
		if (current->signal & ~current->blocked)
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk.
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4539
/*
 *	This will initiate an outgoing connection.
 *
 *	Validates the destination, builds and transmits the SYN (with an
 *	MSS option), moves the socket to SYN_SENT and arms the
 *	retransmission timer. Returns 0 on success or a negative errno.
 */
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
		return(-EISCONN);

	/*
	 *	Don't allow a double connect.
	 */

	if(sk->daddr)
		return -EINVAL;

	/* Need at least family/port/address of sockaddr_in. */
	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */

	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 *	Don't want a TCP connection going to a broadcast address
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	/* Pick an initial sequence number and initialise the send-side
	   sequence state from it. */
	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	release_sock(sk);

	/* May sleep (GFP_KERNEL), hence the lock was released above. */
	buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		sock_wfree(sk, buff);
		release_sock(sk);
		return(-ENETUNREACH);
	}
	/* If we were not bound to a local address, take the route's
	   source address. */
	if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
		sk->saddr = rt->rt_src;
	sk->rcv_saddr = sk->saddr;

	t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));

	/* Build the SYN: template header, then flag/field fixups.
	   doff = 6 => 24 byte header (20 + 4 bytes of MSS option). */
	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	buff->seq = sk->write_seq++;
	t1->seq = htonl(buff->seq);
	sk->sent_seq = sk->write_seq;
	buff->end_seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;
	/* use 512 or whatever user asked for */

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/* MSS: user override, else route MTU, else the 576-byte default,
	   both less IP+TCP header overhead. */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if (rt)
		sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	else
		sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

	/*
	 *	but not bigger than device MTU
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

#ifdef CONFIG_SKIP

	/*
	 *	SKIP devices set their MTU to 65535. This is so they can take packets
	 *	unfragmented to security process then fragment. They could lie to the
	 *	TCP layer about a suitable MTU, but its easier to let skip sort it out
	 *	simply because the final package we want unfragmented is going to be
	 *
	 *	[IPHDR][IPSP][Security data][Modified TCP data][Security data]
	 */

	if(skip_pick_mtu!=NULL)		/* If SKIP is loaded.. */
		sk->mtu=skip_pick_mtu(sk->mtu,dev);
#endif

	/*
	 *	Put in the TCP options to say MTU. (kind 2, length 4, MSS)
	 */

	ptr = skb_put(buff,4);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 *	This must go first otherwise a really quick response will get reset.
	 */

	tcp_cache_zap();
	tcp_set_state(sk,TCP_SYN_SENT);
	/* Seed the RTO from the route's initial RTT if it has one. */
	if(rt&&rt->rt_flags&RTF_IRTT)
		sk->rto = rt->rt_irtt;
	else
		sk->rto = TCP_TIMEOUT_INIT;
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	/* Timer for repeating the SYN until an answer */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	sk->retransmits = 0;	/* Now works the right way instead of a hacked
					initial setting */

	sk->prot->queue_xmit(sk, dev, buff, 0);
	/* NOTE(review): the xmit timer is armed a second time here —
	   appears redundant with the reset above; presumably harmless. */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4712
4713 /*4714 * This functions checks to see if the tcp header is actually acceptable. 4715 */4716
4717 extern__inline__inttcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4718 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4719 {4720 u32next_seq;
4721
4722 next_seq = len - 4*th->doff;
4723 if (th->fin)
4724 next_seq++;
4725 /* if we have a zero window, we can't have any data in the packet.. */4726 if (next_seq && !sk->window)
4727 gotoignore_it;
4728 next_seq += ntohl(th->seq);
4729
4730 /*4731 * This isn't quite right. sk->acked_seq could be more recent4732 * than sk->window. This is however close enough. We will accept4733 * slightly more packets than we should, but it should not cause4734 * problems unless someone is trying to forge packets.4735 */4736
4737 /* have we already seen all of this packet? */4738 if (!after(next_seq+1, sk->acked_seq))
4739 gotoignore_it;
4740 /* or does it start beyond the window? */4741 if (!before(ntohl(th->seq), sk->acked_seq + sk->window + 1))
4742 gotoignore_it;
4743
4744 /* ok, at least part of this packet would seem interesting.. */4745 return 1;
4746
4747 ignore_it:
4748 if (th->rst)
4749 return 0;
4750
4751 /*4752 * Send a reset if we get something not ours and we are4753 * unsynchronized. Note: We don't do anything to our end. We4754 * are just killing the bogus remote connection then we will4755 * connect again and it will work (with luck).4756 */4757
4758 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4759 {4760 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4761 return 1;
4762 }4763
4764 /* Try to resync things. */4765 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4766 return 0;
4767 }4768
/*
 *	When we get a reset we do this: mark the socket zapped, pick the
 *	error callers should see, move to CLOSE (modulo RFC1337 TIME-WAIT
 *	assassination protection), wake the owner and drop the frame.
 */
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Translate the reset into the errno appropriate for our state. */
	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]: a reset must
	 *	not tear down a TIME_WAIT socket.
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 *
 *	Main TCP receive routine.  Finds the owning socket (using a
 *	one-entry last-hit cache), validates the checksum, then walks
 *	the segment through the RFC793 arrival steps (with the RFC1122
 *	corrections noted below).  Segments for a busy socket are
 *	queued on its backlog and replayed later with redo != 0.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	tcp_statistics.TcpInSegs++;

	/* TCP is point to point: drop anything not addressed to this host. */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket, using the last hit cache if applicable.
	 *	On a backlog replay (redo) we must do a full lookup.
	 */

	if(!redo && saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
	{
		sk=(struct sock *)th_cache_sk;
		/*
		 *	Paranoia check: verify the cache against a real lookup.
		 *	(Left in because a cache-related bug was suspected here.)
		 */
		if(sk!=get_sock(&tcp_prot,th->dest, saddr, th->source, daddr))
			printk("Cache mismatch on TCP.\n");
	}
	else
	{
		sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
		th_cache_saddr=saddr;
		th_cache_daddr=daddr;
		th_cache_dport=th->dest;
		th_cache_sport=th->source;
		th_cache_sk=sk;
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo)
	{
		/*
		 *	Pull up the IP header.
		 */
		skb_pull(skb, skb->h.raw-skb->data);

		/*
		 *	Checksum the segment, using the device-supplied partial
		 *	checksum when available; drop silently on failure.
		 */
		if (
			(skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
			(!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
			)
		{
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}

		/* Cache the sequence numbers; SYN and FIN each occupy one
		   sequence number, hence the th->syn + th->fin terms. */
		skb->seq = ntohl(th->seq);
		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
		skb->ack_seq = ntohl(th->ack_seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset
			 *	(checked in tcp_reset), then discard the frame.
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/*
		 *	If the socket is busy, queue the segment on its backlog;
		 *	it will be replayed (redo != 0) when the owner releases
		 *	the socket.  Interrupts are off around the inuse test.
		 */
		cli();
		if (sk->inuse)
		{
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Backlog replay for a socket that has since gone away. */
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)
	{
		/*
		 *	NOTE(review): this "can't happen" branch leaks skb and
		 *	leaves sk->inuse set (no kfree_skb/release_sock) — worth
		 *	confirming and fixing if this path is ever reachable.
		 */
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 */

	skb->sk=sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			/* An ACK at a listener is bogus: reset the sender.
			   (These use the socket TOS.. might want to be the received TOS.) */
			if(th->ack)
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments).
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it.
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* Retransmitted SYN for an embryonic connection? Just drop it. */
		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected.
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/* Reset the ack - its an ack from a
					   different connection  [ th->rst is checked in tcp_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}

				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq = skb->seq+1;
				sk->fin_seq = skb->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				/* No window option seen: fall back to a tiny default. */
				if(sk->max_window==0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

		/*
		 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
		 *	a more complex suggestion for fixing these reuse issues in RFC1644
		 *	but not yet ready for general use. Also see RFC1379.
		 *
		 *	A fresh in-window SYN on a dead TIME_WAIT socket kills it and
		 *	re-routes the SYN to any listener on the same port.
		 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(skb->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			/* Uncharge the buffer from the dying socket before handoff. */
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				/* Offset the new ISN well past the old write sequence. */
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs.
	 */

	/* Step 1: sequence check — drop anything outside the window. */
	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/* Step 2: an in-window RST kills the connection. */
	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793: a SYN in
	 *	window on a synchronised connection is an error.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}

	/*
	 *	Process the ACK
	 */


	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	If the accepted buffer put us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */

	if (sk->rmem_alloc >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}


	/*
	 *	Process urgent data
	 */

	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	Process the encapsulated data
	 */

	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */

	release_sock(sk);
	return 0;
}
/*
 *	This routine sends a packet with an out of date sequence
 *	number. It assumes the other end will try to ack it.
 *
 *	Used as a zero-window probe: either a sliver of real queued
 *	data is re-sent (when the peer has opened a non-zero window)
 *	or a bare ACK carrying sequence sent_seq-1 is emitted to
 *	provoke an acknowledgement.
 */

static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff,*skb;
	struct tcphdr *t1;
	struct device *dev=NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 *	Write data can still be transmitted/retransmitted in the
	 *	following states.  If any other state is encountered, return.
	 *	[listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED &&
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 &&
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	)
	{
		return;
	}

	if ( before(sk->sent_seq, sk->window_seq) &&
	    (skb=skb_peek(&sk->write_queue)))
	{
		/*
		 *	We are probing the opening of a window
		 *	but the window size is != 0
		 *	must have been a result SWS advoidance ( sender ).
		 *	Build a fresh frame carrying just the bytes that
		 *	now fit in the peer's window.
		 */

		struct iphdr *iph;
		struct tcphdr *th;
		struct tcphdr *nth;
		unsigned long win_size;
#if 0
		unsigned long ow_size;
#endif
		void * tcp_data_start;

		/*
		 *	How many bytes can we send ?
		 */

		win_size = sk->window_seq - sk->sent_seq;

		/*
		 *	Recover the buffer pointers from the queued frame,
		 *	which already carries built IP and TCP headers.
		 */

		iph = (struct iphdr *)skb->ip_hdr;
		th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));

		/*
		 *	Grab the data for a temporary frame
		 */

		buff = sock_wmalloc(sk, win_size + th->doff * 4 +
				(iph->ihl << 2) +
				sk->prot->max_header + 15,
				1, GFP_ATOMIC);
		if ( buff == NULL )
			return;

		/*
		 *	If we strip the packet on the write queue we must
		 *	be ready to retransmit this one
		 */

		buff->free = /*0*/1;

		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put headers on the new packet
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, buff->truesize,
				sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
		if (tmp < 0)
		{
			sock_wfree(sk, buff);
			return;
		}

		/*
		 *	Move the TCP header over
		 */

		buff->dev = dev;

		nth = (struct tcphdr *) skb_put(buff,th->doff*4);

		memcpy(nth, th, th->doff * 4);

		/*
		 *	Correct the new header: fresh ack/window, checksum
		 *	cleared before tcp_send_check below.
		 */

		nth->ack = 1;
		nth->ack_seq = htonl(sk->acked_seq);
		nth->window = htons(tcp_select_window(sk));
		nth->check = 0;

		/*
		 *	Find the first data byte.
		 */

		tcp_data_start = (char *) th + (th->doff << 2);

		/*
		 *	Add it to our new buffer
		 */

		memcpy(skb_put(buff,win_size), tcp_data_start, win_size);

		/*
		 *	Remember our right edge sequence number.
		 */

		buff->end_seq = sk->sent_seq + win_size;
		sk->sent_seq = buff->end_seq;		/* Hack */

		/* Clear URG if the urgent pointer fell inside the split-off part. */
		if(th->urg && ntohs(th->urg_ptr) < win_size)
			nth->urg = 0;

		/*
		 *	Checksum the split buffer
		 */

		tcp_send_check(nth, sk->saddr, sk->daddr,
			   nth->doff * 4 + win_size , sk);
	}
	else
	{
		/* Nothing sendable: emit a bare ACK with a stale sequence. */
		buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
		if (buff == NULL)
			return;

		buff->free = 1;
		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put in the IP header and routing stuff.
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
		if (tmp < 0)
		{
			sock_wfree(sk, buff);
			return;
		}

		t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
		memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

		/*
		 *	Use a previous sequence.
		 *	This should cause the other end to send an ack.
		 */

		t1->seq = htonl(sk->sent_seq-1);
		t1->ack = 1;
		t1->res1= 0;
		t1->res2= 0;
		t1->rst = 0;
		t1->urg = 0;
		t1->psh = 0;
		t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
		t1->syn = 0;
		t1->ack_seq = htonl(sk->acked_seq);
		t1->window = htons(tcp_select_window(sk));
		t1->doff = sizeof(*t1)/4;
		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	}

	/*
	 *	Send it.
	 */

	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
5415 /*5416 * A window probe timeout has occurred.5417 */5418
5419 voidtcp_send_probe0(structsock *sk)
/* */5420 {5421 if (sk->zapped)
5422 return; /* After a valid reset we can send no more */5423
5424 tcp_write_wakeup(sk);
5425
5426 sk->backoff++;
5427 sk->rto = min(sk->rto << 1, 120*HZ);
5428 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5429 sk->retransmits++;
5430 sk->prot->retransmits ++;
5431 }5432
5433 /*5434 * Socket option code for TCP. 5435 */5436
5437 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5438 {5439 intval,err;
5440
5441 if(level!=SOL_TCP)
5442 returnip_setsockopt(sk,level,optname,optval,optlen);
5443
5444 if (optval == NULL)
5445 return(-EINVAL);
5446
5447 err=verify_area(VERIFY_READ, optval, sizeof(int));
5448 if(err)
5449 returnerr;
5450
5451 val = get_user((int *)optval);
5452
5453 switch(optname)
5454 {5455 caseTCP_MAXSEG:
5456 /*5457 * values greater than interface MTU won't take effect. however at5458 * the point when this call is done we typically don't yet know5459 * which interface is going to be used5460 */5461 if(val<1||val>MAX_WINDOW)
5462 return -EINVAL;
5463 sk->user_mss=val;
5464 return 0;
5465 caseTCP_NODELAY:
5466 sk->nonagle=(val==0)?0:1;
5467 return 0;
5468 default:
5469 return(-ENOPROTOOPT);
5470 }5471 }5472
5473 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5474 {5475 intval,err;
5476
5477 if(level!=SOL_TCP)
5478 returnip_getsockopt(sk,level,optname,optval,optlen);
5479
5480 switch(optname)
5481 {5482 caseTCP_MAXSEG:
5483 val=sk->user_mss;
5484 break;
5485 caseTCP_NODELAY:
5486 val=sk->nonagle;
5487 break;
5488 default:
5489 return(-ENOPROTOOPT);
5490 }5491 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5492 if(err)
5493 returnerr;
5494 put_user(sizeof(int),(int *) optlen);
5495
5496 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5497 if(err)
5498 returnerr;
5499 put_user(val,(int *)optval);
5500
5501 return(0);
5502 }5503
5504
/*
 *	The TCP operations table plugged into the generic INET socket
 *	layer.  Initialisers are positional; the slot labels below are
 *	inferred from the function names and the usual struct proto
 *	layout — confirm against the struct proto declaration in the
 *	socket header before reordering.
 */
struct proto tcp_prot = {
	tcp_close,		/* close */
	ip_build_header,	/* build_header */
	tcp_connect,		/* connect */
	tcp_accept,		/* accept */
	ip_queue_xmit,		/* queue_xmit */
	tcp_retransmit,		/* retransmit */
	tcp_write_wakeup,	/* write_wakeup (window probe) */
	tcp_read_wakeup,	/* read_wakeup */
	tcp_rcv,		/* rcv */
	tcp_select,		/* select */
	tcp_ioctl,		/* ioctl */
	NULL,			/* init — none needed for TCP */
	tcp_shutdown,		/* shutdown */
	tcp_setsockopt,		/* setsockopt */
	tcp_getsockopt,		/* getsockopt */
	tcp_sendmsg,		/* sendmsg */
	tcp_recvmsg,		/* recvmsg */
	NULL,			/* No special bind() */
	128,			/* max_header */
	0,			/* retransmits counter */
	"TCP",			/* protocol name */
	0, 0,			/* usage counters — TODO confirm field names */
	{NULL,}			/* socket hash array */
};