1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while 26 * sk->inuse=1 and was trying to connect 27 * (tcp_err()). 28 * Alan Cox : All icmp error handling was broken 29 * pointers passed where wrong and the 30 * socket was looked up backwards. Nobody 31 * tested any icmp error code obviously. 32 * Alan Cox : tcp_err() now handled properly. It 33 * wakes people on errors. select 34 * behaves and the icmp error race 35 * has gone by moving it into sock.c 36 * Alan Cox : tcp_reset() fixed to work for 37 * everything not just packets for 38 * unknown sockets. 39 * Alan Cox : tcp option processing. 40 * Alan Cox : Reset tweaked (still not 100%) [Had 41 * syn rule wrong] 42 * Herp Rosmanith : More reset fixes 43 * Alan Cox : No longer acks invalid rst frames. 44 * Acking any kind of RST is right out. 45 * Alan Cox : Sets an ignore me flag on an rst 46 * receive otherwise odd bits of prattle 47 * escape still 48 * Alan Cox : Fixed another acking RST frame bug. 49 * Should stop LAN workplace lockups. 
50 * Alan Cox : Some tidyups using the new skb list 51 * facilities 52 * Alan Cox : sk->keepopen now seems to work 53 * Alan Cox : Pulls options out correctly on accepts 54 * Alan Cox : Fixed assorted sk->rqueue->next errors 55 * Alan Cox : PSH doesn't end a TCP read. Switched a 56 * bit to skb ops. 57 * Alan Cox : Tidied tcp_data to avoid a potential 58 * nasty. 59 * Alan Cox : Added some better commenting, as the 60 * tcp is hard to follow 61 * Alan Cox : Removed incorrect check for 20 * psh 62 * Michael O'Reilly : ack < copied bug fix. 63 * Johannes Stille : Misc tcp fixes (not all in yet). 64 * Alan Cox : FIN with no memory -> CRASH 65 * Alan Cox : Added socket option proto entries. 66 * Also added awareness of them to accept. 67 * Alan Cox : Added TCP options (SOL_TCP) 68 * Alan Cox : Switched wakeup calls to callbacks, 69 * so the kernel can layer network 70 * sockets. 71 * Alan Cox : Use ip_tos/ip_ttl settings. 72 * Alan Cox : Handle FIN (more) properly (we hope). 73 * Alan Cox : RST frames sent on unsynchronised 74 * state ack error. 75 * Alan Cox : Put in missing check for SYN bit. 76 * Alan Cox : Added tcp_select_window() aka NET2E 77 * window non shrink trick. 78 * Alan Cox : Added a couple of small NET2E timer 79 * fixes 80 * Charles Hedrick : TCP fixes 81 * Toomas Tamm : TCP window fixes 82 * Alan Cox : Small URG fix to rlogin ^C ack fight 83 * Charles Hedrick : Rewrote most of it to actually work 84 * Linus : Rewrote tcp_read() and URG handling 85 * completely 86 * Gerhard Koerting: Fixed some missing timer handling 87 * Matthew Dillon : Reworked TCP machine states as per RFC 88 * Gerhard Koerting: PC/TCP workarounds 89 * Adam Caldwell : Assorted timer/timing errors 90 * Matthew Dillon : Fixed another RST bug 91 * Alan Cox : Move to kernel side addressing changes. 92 * Alan Cox : Beginning work on TCP fastpathing 93 * (not yet usable) 94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
95 * Alan Cox : TCP fast path debugging 96 * Alan Cox : Window clamping 97 * Michael Riepe : Bug in tcp_check() 98 * Matt Dillon : More TCP improvements and RST bug fixes 99 * Matt Dillon : Yet more small nasties remove from the 100 * TCP code (Be very nice to this man if 101 * tcp finally works 100%) 8) 102 * Alan Cox : BSD accept semantics. 103 * Alan Cox : Reset on closedown bug. 104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 105 * Michael Pall : Handle select() after URG properly in 106 * all cases. 107 * Michael Pall : Undo the last fix in tcp_read_urg() 108 * (multi URG PUSH broke rlogin). 109 * Michael Pall : Fix the multi URG PUSH problem in 110 * tcp_readable(), select() after URG 111 * works now. 112 * Michael Pall : recv(...,MSG_OOB) never blocks in the 113 * BSD api. 114 * Alan Cox : Changed the semantics of sk->socket to 115 * fix a race and a signal problem with 116 * accept() and async I/O. 117 * Alan Cox : Relaxed the rules on tcp_sendto(). 118 * Yury Shevchuk : Really fixed accept() blocking problem. 119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 120 * clients/servers which listen in on 121 * fixed ports. 122 * Alan Cox : Cleaned the above up and shrank it to 123 * a sensible code size. 124 * Alan Cox : Self connect lockup fix. 125 * Alan Cox : No connect to multicast. 126 * Ross Biro : Close unaccepted children on master 127 * socket close. 128 * Alan Cox : Reset tracing code. 129 * Alan Cox : Spurious resets on shutdown. 130 * Alan Cox : Giant 15 minute/60 second timer error 131 * Alan Cox : Small whoops in selecting before an 132 * accept. 133 * Alan Cox : Kept the state trace facility since 134 * it's handy for debugging. 135 * Alan Cox : More reset handler fixes. 
136 * Alan Cox : Started rewriting the code based on 137 * the RFC's for other useful protocol 138 * references see: Comer, KA9Q NOS, and 139 * for a reference on the difference 140 * between specifications and how BSD 141 * works see the 4.4lite source. 142 * A.N.Kuznetsov : Don't time wait on completion of tidy 143 * close. 144 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 145 * Linus Torvalds : Fixed BSD port reuse to work first syn 146 * Alan Cox : Reimplemented timers as per the RFC 147 * and using multiple timers for sanity. 148 * Alan Cox : Small bug fixes, and a lot of new 149 * comments. 150 * Alan Cox : Fixed dual reader crash by locking 151 * the buffers (much like datagram.c) 152 * Alan Cox : Fixed stuck sockets in probe. A probe 153 * now gets fed up of retrying without 154 * (even a no space) answer. 155 * Alan Cox : Extracted closing code better 156 * Alan Cox : Fixed the closing state machine to 157 * resemble the RFC. 158 * Alan Cox : More 'per spec' fixes. 159 * Jorge Cwik : Even faster checksumming. 160 * Alan Cox : tcp_data() doesn't ack illegal PSH 161 * only frames. At least one pc tcp stack 162 * generates them. 163 * Alan Cox : Cache last socket. 164 * Alan Cox : Per route irtt. 165 * Matt Day : Select() match BSD precisely on error 166 * Alan Cox : New buffers 167 * Marc Tamsky : Various sk->prot->retransmits and 168 * sk->retransmits misupdating fixed. 169 * Fixed tcp_write_timeout: stuck close, 170 * and TCP syn retries gets used now. 171 * Mark Yarvis : In tcp_read_wakeup(), don't send an 172 * ack if stat is TCP_CLOSED. 173 * Alan Cox : Look up device on a retransmit - routes may 174 * change. Doesn't yet cope with MSS shrink right 175 * but its a start! 176 * Marc Tamsky : Closing in closing fixes. 177 * Mike Shaver : RFC1122 verifications 178 * 179 * 180 * To Fix: 181 * Fast path the code. 
Two things here - fix the window calculation 182 * so it doesn't iterate over the queue, also spot packets with no funny 183 * options arriving in order and process directly. 184 * 185 * Implement RFC 1191 [Path MTU discovery] 186 * Look at the effect of implementing RFC 1337 suggestions and their impact. 187 * Rewrite output state machine to use a single queue and do low window 188 * situations as per the spec (RFC 1122) 189 * Speed up input assembly algorithm. 190 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 191 * could do with it working on IPv4 192 * User settable/learned rtt/max window/mtu 193 * Cope with MTU/device switches when retransmitting in tcp. 194 * Fix the window handling to use PR's new code. 195 * 196 * Change the fundamental structure to a single send queue maintained 197 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 198 * active routes too]). Cut the queue off in tcp_retransmit/ 199 * tcp_transmit. 200 * Change the receive queue to assemble as it goes. This lets us 201 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 202 * tcp_data/tcp_read as well as the window shrink crud. 203 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 204 * tcp_queue_skb seem obvious routines to extract. 205 * 206 * This program is free software; you can redistribute it and/or 207 * modify it under the terms of the GNU General Public License 208 * as published by the Free Software Foundation; either version 209 * 2 of the License, or(at your option) any later version. 210 * 211 * Description of States: 212 * 213 * TCP_SYN_SENT sent a connection request, waiting for ack 214 * 215 * TCP_SYN_RECV received a connection request, sent ack, 216 * waiting for final ack in three-way handshake. 
217 * 218 * TCP_ESTABLISHED connection established 219 * 220 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 221 * transmission of remaining buffered data 222 * 223 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 224 * to shutdown 225 * 226 * TCP_CLOSING both sides have shutdown but we still have 227 * data we have to finish sending 228 * 229 * TCP_TIME_WAIT timeout to catch resent junk before entering 230 * closed, can only be entered from FIN_WAIT2 231 * or CLOSING. Required because the other end 232 * may not have gotten our last ACK causing it 233 * to retransmit the data packet (which we ignore) 234 * 235 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 236 * us to finish writing our data and to shutdown 237 * (we have to close() to move on to LAST_ACK) 238 * 239 * TCP_LAST_ACK out side has shutdown after remote has 240 * shutdown. There may still be data in our 241 * buffer that we have to finish sending 242 * 243 * TCP_CLOSE socket is finished 244 */ 245
246 /* 247 * RFC1122 status: 248 * NOTE: I'm not going to be doing comments in the code for this one except 249 * for violations and the like. tcp.c is just too big... If I say something 250 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out 251 * with Alan. -- MS 950903 252 * 253 * Use of PSH (4.2.2.2) 254 * MAY aggregate data sent without the PSH flag. (does) 255 * MAY queue data recieved without the PSH flag. (does) 256 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't) 257 * MAY implement PSH on send calls. (doesn't, thus:) 258 * MUST NOT buffer data indefinitely (doesn't [1 second]) 259 * MUST set PSH on last segment (does) 260 * MAY pass received PSH to application layer (doesn't) 261 * SHOULD send maximum-sized segment whenever possible. (almost always does) 262 * 263 * Window Size (4.2.2.3, 4.2.2.16) 264 * MUST treat window size as an unsigned number (does) 265 * SHOULD treat window size as a 32-bit number (does not) 266 * MUST NOT shrink window once it is offered (does not normally) 267 * 268 * Urgent Pointer (4.2.2.4) 269 * **MUST point urgent pointer to last byte of urgent data (not right 270 * after). (doesn't, to be like BSD) 271 * MUST inform application layer asynchronously of incoming urgent 272 * data. (does) 273 * MUST provide application with means of determining the amount of 274 * urgent data pending. (does) 275 * **MUST support urgent data sequence of arbitrary length. (doesn't, but 276 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) 277 * [Follows BSD 1 byte of urgent data] 278 * 279 * TCP Options (4.2.2.5) 280 * MUST be able to recieve TCP options in any segment. (does) 281 * MUST ignore unsupported options (does) 282 * 283 * Maximum Segment Size Option (4.2.2.6) 284 * MUST implement both sending and receiving MSS. (does) 285 * SHOULD send an MSS with every SYN where recieve MSS != 536 (MAY send 286 * it always). 
(does, even when MSS == 536, which is legal) 287 * MUST assume MSS == 536 if no MSS received at connection setup (does) 288 * MUST calculate "effective send MSS" correctly: 289 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) 290 * (does - but allows operator override) 291 * 292 * TCP Checksum (4.2.2.7) 293 * MUST generate and check TCP checksum. (does) 294 * 295 * Initial Sequence Number Selection (4.2.2.8) 296 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's 297 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is 298 * necessary for 10Mbps networks - and harder than BSD to spoof!) 299 * 300 * Simultaneous Open Attempts (4.2.2.10) 301 * MUST support simultaneous open attempts (does) 302 * 303 * Recovery from Old Duplicate SYN (4.2.2.11) 304 * MUST keep track of active vs. passive open (does) 305 * 306 * RST segment (4.2.2.12) 307 * SHOULD allow an RST segment to contain data (does, but doesn't do 308 * anything with it, which is standard) 309 * 310 * Closing a Connection (4.2.2.13) 311 * MUST inform application of whether connectin was closed by RST or 312 * normal close. (does) 313 * MAY allow "half-duplex" close (treat connection as closed for the 314 * local app, even before handshake is done). (does) 315 * MUST linger in TIME_WAIT for 2 * MSL (does) 316 * 317 * Retransmission Timeout (4.2.2.15) 318 * MUST implement Jacobson's slow start and congestion avoidance 319 * stuff. (does) 320 * 321 * Probing Zero Windows (4.2.2.17) 322 * MUST support probing of zero windows. (does) 323 * MAY keep offered window closed indefinitely. (does) 324 * MUST allow remote window to stay closed indefinitely. (does) 325 * 326 * Passive Open Calls (4.2.2.18) 327 * MUST NOT let new passive open affect other connections. (doesn't) 328 * MUST support passive opens (LISTENs) concurrently. (does) 329 * 330 * Time to Live (4.2.2.19) 331 * MUST make TCP TTL configurable. 
(does - IP_TTL option) 332 * 333 * Event Processing (4.2.2.20) 334 * SHOULD queue out-of-order segments. (does) 335 * MUST aggregate ACK segments whenever possible. (does but badly) 336 * 337 * Retransmission Timeout Calculation (4.2.3.1) 338 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO 339 * calculation. (does, or at least explains them in the comments 8*b) 340 * SHOULD initialize RTO to 0 and RTT to 3. (does) 341 * 342 * When to Send an ACK Segment (4.2.3.2) 343 * SHOULD implement delayed ACK. (does not) 344 * MUST keep ACK delay < 0.5 sec. (N/A) 345 * 346 * When to Send a Window Update (4.2.3.3) 347 * MUST implement receiver-side SWS. (does) 348 * 349 * When to Send Data (4.2.3.4) 350 * MUST implement sender-side SWS. (does - imperfectly) 351 * SHOULD implement Nagle algorithm. (does) 352 * 353 * TCP Connection Failures (4.2.3.5) 354 * MUST handle excessive retransmissions "properly" (see the RFC). (does) 355 * SHOULD inform application layer of soft errors. (doesn't) 356 * 357 * TCP Keep-Alives (4.2.3.6) 358 * MAY provide keep-alives. (does) 359 * MUST make keep-alives configurable on a per-connection basis. (does) 360 * MUST default to no keep-alives. (does) 361 * **MUST make keep-alive interval configurable. (doesn't) 362 * **MUST make default keep-alive interval > 2 hours. (doesn't) 363 * MUST NOT interpret failure to ACK keep-alive packet as dead 364 * connection. (doesn't) 365 * SHOULD send keep-alive with no data. (does) 366 * 367 * TCP Multihoming (4.2.3.7) 368 * MUST get source address from IP layer before sending first 369 * SYN. (does) 370 * MUST use same local address for all segments of a connection. (does) 371 * 372 * IP Options (4.2.3.8) 373 * (I don't think the IP layer sees the IP options, yet.) 374 * MUST ignore unsupported IP options. (does, I guess 8*b) 375 * MAY support Time Stamp and Record Route. (doesn't) 376 * **MUST allow application to specify a source route. (doesn't?) 
377 * **MUST allow receieved Source Route option to set route for all future 378 * segments on this connection. (doesn't, not that I think it's a 379 * huge problem) 380 * 381 * ICMP messages (4.2.3.9) 382 * MUST act on ICMP errors. (does) 383 * MUST slow transmission upon receipt of a Source Quench. (does) 384 * MUST NOT abort connection upon receipt of soft Destination 385 * Unreachables (0, 1, 5), Time Exceededs and Parameter 386 * Problems. (doesn't) 387 * SHOULD report soft Destination Unreachables etc. to the 388 * application. (doesn't) 389 * SHOULD abort connection upon receipt of hard Destination Unreachable 390 * messages (2, 3, 4). (does) 391 * 392 * Remote Address Validation (4.2.3.10) 393 * MUST reject as an error OPEN for invalid remote IP address. (does) 394 * MUST ignore SYN with invalid source address. (does) 395 * MUST silently discard incoming SYN for broadcast/multicast 396 * address. (does) 397 * 398 * Asynchronous Reports (4.2.4.1) 399 * **MUST provide mechanism for reporting soft errors to application 400 * layer. (doesn't) 401 * 402 * Type of Service (4.2.4.2) 403 * MUST allow application layer to set Type of Service. (does IP_TOS) 404 * 405 * (Whew. -- MS 950903) 406 **/ 407
408 #include <linux/types.h>
409 #include <linux/sched.h>
410 #include <linux/mm.h>
411 #include <linux/time.h>
412 #include <linux/string.h>
413 #include <linux/config.h>
414 #include <linux/socket.h>
415 #include <linux/sockios.h>
416 #include <linux/termios.h>
417 #include <linux/in.h>
418 #include <linux/fcntl.h>
419 #include <linux/inet.h>
420 #include <linux/netdevice.h>
421 #include <net/snmp.h>
422 #include <net/ip.h>
423 #include <net/protocol.h>
424 #include <net/icmp.h>
425 #include <net/tcp.h>
426 #include <net/arp.h>
427 #include <linux/skbuff.h>
428 #include <net/sock.h>
429 #include <net/route.h>
430 #include <linux/errno.h>
431 #include <linux/timer.h>
432 #include <asm/system.h>
433 #include <asm/segment.h>
434 #include <linux/mm.h>
435 #include <net/checksum.h>
436
/*
 *	The MSL timer is the 'normal' timer used for TIME_WAIT style
 *	delays (see tcp_time_wait()); it is just the generic reset_timer.
 */

#define reset_msl_timer(x, y, z)	reset_timer(x, y, z)
442
443 #define SEQ_TICK 3
444 unsignedlongseq_offset;
445 structtcp_mibtcp_statistics;
446
/*
 *	Cached last hit socket: the demultiplexer remembers the most
 *	recently matched (saddr, daddr, sport, dport) -> sock mapping.
 *	volatile because the cache is read/written from interrupt context.
 */

volatile unsigned long th_cache_saddr, th_cache_daddr;
volatile unsigned short th_cache_dport, th_cache_sport;
volatile struct sock *th_cache_sk;
454
455 voidtcp_cache_zap(void)
/* */ 456 { 457 unsignedlongflags;
458 save_flags(flags);
459 cli();
460 th_cache_saddr=0;
461 th_cache_daddr=0;
462 th_cache_dport=0;
463 th_cache_sport=0;
464 th_cache_sk=NULL;
465 restore_flags(flags);
466 } 467
468 staticvoidtcp_close(structsock *sk, inttimeout);
469
470
/*
 *	The less said about this the better, but it works and will do for 1.2.
 *	(Single shared wait queue used to wake select() on a listening
 *	socket when a child reaches ESTABLISHED - see tcp_set_state().)
 */

static struct wait_queue *master_select_wakeup;
476
/*
 *	Return the smaller of two unsigned quantities.  The int return
 *	type is kept for compatibility with the historical callers.
 */
static __inline__ int min(unsigned int a, unsigned int b)
{
	if (a < b)
		return a;
	return b;
}
/* Define STATE_TRACE to get a printk on every TCP state transition. */
#undef STATE_TRACE

#ifdef STATE_TRACE
/* Human readable state names, indexed by the TCP_xxx state values. */
static char *statename[] = {
	"Unused", "Established", "Syn Sent", "Syn Recv",
	"Fin Wait 1", "Fin Wait 2", "Time Wait", "Close",
	"Close Wait", "Last ACK", "Listen", "Closing"
};
#endif
494 static__inline__voidtcp_set_state(structsock *sk, intstate)
/* */ 495 { 496 if(sk->state==TCP_ESTABLISHED)
497 tcp_statistics.TcpCurrEstab--;
498 #ifdefSTATE_TRACE 499 if(sk->debug)
500 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
501 #endif 502 /* This is a hack but it doesn't occur often and it's going to 503 be a real to fix nicely */ 504
505 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
506 { 507 wake_up_interruptible(&master_select_wakeup);
508 } 509 sk->state=state;
510 if(state==TCP_ESTABLISHED)
511 tcp_statistics.TcpCurrEstab++;
512 if(sk->state==TCP_CLOSE)
513 tcp_cache_zap();
514 } 515
516 /* 517 * This routine picks a TCP windows for a socket based on 518 * the following constraints 519 * 520 * 1. The window can never be shrunk once it is offered (RFC 793) 521 * 2. We limit memory per socket 522 * 523 * For now we use NET2E3's heuristic of offering half the memory 524 * we have handy. All is not as bad as this seems however because 525 * of two things. Firstly we will bin packets even within the window 526 * in order to get the data we are waiting for into the memory limit. 527 * Secondly we bin common duplicate forms at receive time 528 * Better heuristics welcome 529 */ 530
531 inttcp_select_window(structsock *sk)
/* */ 532 { 533 intnew_window = sock_rspace(sk);
534
535 if(sk->window_clamp)
536 new_window=min(sk->window_clamp,new_window);
537 /* 538 * Two things are going on here. First, we don't ever offer a 539 * window less than min(sk->mss, MAX_WINDOW/2). This is the 540 * receiver side of SWS as specified in RFC1122. 541 * Second, we always give them at least the window they 542 * had before, in order to avoid retracting window. This 543 * is technically allowed, but RFC1122 advises against it and 544 * in practice it causes trouble. 545 * 546 * Fixme: This doesn't correctly handle the case where 547 * new_window > sk->window but not by enough to allow for the 548 * shift in sequence space. 549 */ 550 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
551 return(sk->window);
552 return(new_window);
553 } 554
555 /* 556 * Find someone to 'accept'. Must be called with 557 * sk->inuse=1 or cli() 558 */ 559
560 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 561 { 562 structsk_buff *p=skb_peek(&s->receive_queue);
563 if(p==NULL)
564 returnNULL;
565 do 566 { 567 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
568 returnp;
569 p=p->next;
570 } 571 while(p!=(structsk_buff *)&s->receive_queue);
572 returnNULL;
573 } 574
575 /* 576 * Remove a completed connection and return it. This is used by 577 * tcp_accept() to get connections from the queue. 578 */ 579
580 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 581 { 582 structsk_buff *skb;
583 unsignedlongflags;
584 save_flags(flags);
585 cli();
586 skb=tcp_find_established(s);
587 if(skb!=NULL)
588 skb_unlink(skb); /* Take it off the queue */ 589 restore_flags(flags);
590 returnskb;
591 } 592
593 /* 594 * This routine closes sockets which have been at least partially 595 * opened, but not yet accepted. Currently it is only called by 596 * tcp_close, and timeout mirrors the value there. 597 */ 598
599 staticvoidtcp_close_pending (structsock *sk)
/* */ 600 { 601 structsk_buff *skb;
602
603 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
604 { 605 skb->sk->dead=1;
606 tcp_close(skb->sk, 0);
607 kfree_skb(skb, FREE_READ);
608 } 609 return;
610 } 611
612 /* 613 * Enter the time wait state. 614 */ 615
616 staticvoidtcp_time_wait(structsock *sk)
/* */ 617 { 618 tcp_set_state(sk,TCP_TIME_WAIT);
619 sk->shutdown = SHUTDOWN_MASK;
620 if (!sk->dead)
621 sk->state_change(sk);
622 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
623 } 624
625 /* 626 * A socket has timed out on its send queue and wants to do a 627 * little retransmitting. Currently this means TCP. 628 */ 629
630 voidtcp_do_retransmit(structsock *sk, intall)
/* */ 631 { 632 structsk_buff * skb;
633 structproto *prot;
634 structdevice *dev;
635 intct=0;
636 structrtable *rt;
637
638 prot = sk->prot;
639 skb = sk->send_head;
640
641 while (skb != NULL)
642 { 643 structtcphdr *th;
644 structiphdr *iph;
645 intsize;
646
647 dev = skb->dev;
648 IS_SKB(skb);
649 skb->when = jiffies;
650
651 /* 652 * Discard the surplus MAC header 653 */ 654
655 skb_pull(skb,((unsignedchar *)skb->ip_hdr)-skb->data);
656
657 /* 658 * In general it's OK just to use the old packet. However we 659 * need to use the current ack and window fields. Urg and 660 * urg_ptr could possibly stand to be updated as well, but we 661 * don't keep the necessary data. That shouldn't be a problem, 662 * if the other end is doing the right thing. Since we're 663 * changing the packet, we have to issue a new IP identifier. 664 */ 665
666 iph = (structiphdr *)skb->data;
667 th = (structtcphdr *)(((char *)iph) + (iph->ihl << 2));
668 size = ntohs(iph->tot_len) - (iph->ihl<<2);
669
670 /* 671 * Note: We ought to check for window limits here but 672 * currently this is done (less efficiently) elsewhere. 673 */ 674
675 iph->id = htons(ip_id_count++);
676 ip_send_check(iph);
677
678 /* 679 * Put a MAC header back on (may cause ARPing) 680 */ 681
682 if(skb->localroute)
683 rt=ip_rt_local(iph->daddr,NULL,NULL);
684 else 685 rt=ip_rt_route(iph->daddr,NULL,NULL);
686
687 if(rt==NULL) /* Deep poo */ 688 { 689 if(skb->sk)
690 { 691 skb->sk->err=ENETUNREACH;
692 skb->sk->error_report(skb->sk);
693 } 694 } 695 else 696 { 697 dev=rt->rt_dev;
698 skb->raddr=rt->rt_gateway;
699 if(skb->raddr==0)
700 skb->raddr=iph->daddr;
701 skb->dev=dev;
702 skb->arp=1;
703 if(dev->hard_header)
704 { 705 if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
706 skb->arp=0;
707 } 708
709 /* 710 * This is not the right way to handle this. We have to 711 * issue an up to date window and ack report with this 712 * retransmit to keep the odd buggy tcp that relies on 713 * the fact BSD does this happy. 714 * We don't however need to recalculate the entire 715 * checksum, so someone wanting a small problem to play 716 * with might like to implement RFC1141/RFC1624 and speed 717 * this up by avoiding a full checksum. 718 */ 719
720 th->ack_seq = ntohl(sk->acked_seq);
721 th->window = ntohs(tcp_select_window(sk));
722 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
723
724 /* 725 * If the interface is (still) up and running, kick it. 726 */ 727
728 if (dev->flags & IFF_UP)
729 { 730 /* 731 * If the packet is still being sent by the device/protocol 732 * below then don't retransmit. This is both needed, and good - 733 * especially with connected mode AX.25 where it stops resends 734 * occurring of an as yet unsent anyway frame! 735 * We still add up the counts as the round trip time wants 736 * adjusting. 737 */ 738 if (sk && !skb_device_locked(skb))
739 { 740 /* Remove it from any existing driver queue first! */ 741 skb_unlink(skb);
742 /* Now queue it */ 743 ip_statistics.IpOutRequests++;
744 dev_queue_xmit(skb, dev, sk->priority);
745 } 746 } 747 } 748
749 /* 750 * Count retransmissions 751 */ 752
753 ct++;
754 sk->prot->retransmits ++;
755 tcp_statistics.TcpRetransSegs++;
756
757
758 /* 759 * Only one retransmit requested. 760 */ 761
762 if (!all)
763 break;
764
765 /* 766 * This should cut it off before we send too many packets. 767 */ 768
769 if (ct >= sk->cong_window)
770 break;
771 skb = skb->link3;
772 } 773 } 774
775 /* 776 * Reset the retransmission timer 777 */ 778
779 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 780 { 781 del_timer(&sk->retransmit_timer);
782 sk->ip_xmit_timeout = why;
783 if((int)when < 0)
784 { 785 when=3;
786 printk("Error: Negative timer in xmit_timer\n");
787 } 788 sk->retransmit_timer.expires=jiffies+when;
789 add_timer(&sk->retransmit_timer);
790 } 791
792 /* 793 * This is the normal code called for timeouts. It does the retransmission 794 * and then does backoff. tcp_do_retransmit is separated out because 795 * tcp_ack needs to send stuff from the retransmit queue without 796 * initiating a backoff. 797 */ 798
799
800 voidtcp_retransmit_time(structsock *sk, intall)
/* */ 801 { 802 tcp_do_retransmit(sk, all);
803
804 /* 805 * Increase the timeout each time we retransmit. Note that 806 * we do not increase the rtt estimate. rto is initialized 807 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 808 * that doubling rto each time is the least we can get away with. 809 * In KA9Q, Karn uses this for the first few times, and then 810 * goes to quadratic. netBSD doubles, but only goes up to *64, 811 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 812 * defined in the protocol as the maximum possible RTT. I guess 813 * we'll have to use something other than TCP to talk to the 814 * University of Mars. 815 * 816 * PAWS allows us longer timeouts and large windows, so once 817 * implemented ftp to mars will work nicely. We will have to fix 818 * the 120 second clamps though! 819 */ 820
821 sk->retransmits++;
822 sk->prot->retransmits++;
823 sk->backoff++;
824 sk->rto = min(sk->rto << 1, 120*HZ);
825 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
826 } 827
828
829 /* 830 * A timer event has trigger a tcp retransmit timeout. The 831 * socket xmit queue is ready and set up to send. Because 832 * the ack receive code keeps the queue straight we do 833 * nothing clever here. 834 */ 835
836 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 837 { 838 if (all)
839 { 840 tcp_retransmit_time(sk, all);
841 return;
842 } 843
844 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 845 /* sk->ssthresh in theory can be zero. I guess that's OK */ 846 sk->cong_count = 0;
847
848 sk->cong_window = 1;
849
850 /* Do the actual retransmit. */ 851 tcp_retransmit_time(sk, all);
852 } 853
854 /* 855 * A write timeout has occurred. Process the after effects. 856 */ 857
858 staticinttcp_write_timeout(structsock *sk)
/* */ 859 { 860 /* 861 * Look for a 'soft' timeout. 862 */ 863 if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
864 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
865 { 866 /* 867 * Attempt to recover if arp has changed (unlikely!) or 868 * a route has shifted (not supported prior to 1.3). 869 */ 870 arp_destroy (sk->daddr, 0);
871 /*ip_route_check (sk->daddr);*/ 872 } 873
874 /* 875 * Have we tried to SYN too many times (repent repent 8)) 876 */ 877
878 if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
879 { 880 sk->err=ETIMEDOUT;
881 sk->error_report(sk);
882 del_timer(&sk->retransmit_timer);
883 tcp_statistics.TcpAttemptFails++; /* Is this right ??? - FIXME - */ 884 tcp_set_state(sk,TCP_CLOSE);
885 /* Don't FIN, we got nothing back */ 886 release_sock(sk);
887 return 0;
888 } 889 /* 890 * Has it gone just too far ? 891 */ 892 if (sk->retransmits > TCP_RETR2)
893 { 894 sk->err = ETIMEDOUT;
895 sk->error_report(sk);
896 del_timer(&sk->retransmit_timer);
897 /* 898 * Time wait the socket 899 */ 900 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
901 { 902 tcp_set_state(sk,TCP_TIME_WAIT);
903 reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
904 } 905 else 906 { 907 /* 908 * Clean up time. 909 */ 910 tcp_set_state(sk, TCP_CLOSE);
911 release_sock(sk);
912 return 0;
913 } 914 } 915 return 1;
916 } 917
918 /* 919 * The TCP retransmit timer. This lacks a few small details. 920 * 921 * 1. An initial rtt timeout on the probe0 should cause what we can 922 * of the first write queue buffer to be split and sent. 923 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report 924 * ETIMEDOUT if we know an additional 'soft' error caused this. 925 * tcp_err should save a 'soft error' for us. 926 */ 927
/*
 *	The TCP retransmit timer entry point.  'data' is the socket this
 *	timer belongs to; sk->ip_xmit_timeout records WHY the timer was
 *	armed (probe, retransmit, or keepalive) and selects the action.
 */
static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	int why = sk->ip_xmit_timeout;

	/*
	 *	only process if socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	/* Claim the socket, then interrupts can come back on. */
	sk->inuse = 1;
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup(sk);
		if (!sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				/* Nothing outstanding: nothing to retransmit. */
				restore_flags(flags);
			}
			else
			{
				/*
				 *	Kicked by a delayed ack. Reset timer
				 *	correctly now
				 */
				if (jiffies < skb->when + sk->rto)
				{
					reset_xmit_timer(sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission
				 */
				sk->retransmits++;
				sk->prot->retransmits++;
				sk->prot->retransmit(sk, 0);
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 *	this reset_timer() call is a hack, this is not
			 *	how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup(sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
1029 /*1030 * This routine is called by the ICMP module when it gets some1031 * sort of error condition. If err < 0 then the socket should1032 * be closed and the error returned to the user. If err > 01033 * it's just the icmp type << 8 | icmp code. After adjustment1034 * header points to the first 8 bytes of the tcp header. We need1035 * to find the appropriate port.1036 */1037
/*
 *	ICMP error handler for TCP.  'header' points at the offending IP
 *	header; we step past it to reach the first 8 bytes of the TCP
 *	header and look up the matching socket.  type/code are the raw
 *	ICMP type and code values.
 */
void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
	__u32 saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	struct iphdr *iph = (struct iphdr *)header;

	/* Skip the IP header (ihl is in 32-bit words) to find the TCP header. */
	header += 4*iph->ihl;


	th = (struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)
		return;

	if (type == ICMP_SOURCE_QUENCH)
	{
		/*
		 * FIXME:
		 * For now we will just trigger a linear backoff.
		 * The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	/* NOTE(review): PARAMETERPROB does not return here; it falls through
	 * to the generic code check below — confirm this is intentional. */
	if (type == ICMP_PARAMETERPROB)
	{
		sk->err = EPROTO;
		sk->error_report(sk);
	}

	/*
	 * If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 */

	if (code < 13 && (icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV))
	{
		sk->err = icmp_err_convert[code].errno;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
		{
			/* A fatal error during connection establishment kills it. */
			tcp_statistics.TcpAttemptFails++;
			tcp_set_state(sk,TCP_CLOSE);
			sk->error_report(sk);		/* Wake people up to see the error (see connect in sock.c) */
		}
	}
	return;
}
1090
1091 /*1092 * Walk down the receive queue counting readable data until we hit the end or we find a gap1093 * in the received data queue (ie a frame missing that needs sending to us). Not1094 * sorting using two queues as data arrives makes life so much harder.1095 */1096
/*
 *	Count the bytes readable on 'sk' by walking the receive queue from
 *	sk->copied_seq until a sequence hole, a PSH, or the end of the
 *	queue.  SYN and URG octets occupy sequence space but carry no user
 *	data, so they are excluded from the count.  Runs with interrupts
 *	disabled while touching the queue.
 */
static int tcp_readable(struct sock *sk)
{
	unsigned long counted;
	unsigned long amount;
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if (sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if (sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 *	Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->h.th->seq))	/* Found a hole so stops here */
			break;
		sum = skb->len - (counted - skb->h.th->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;		/* SYN occupies a sequence number but no data byte */
		if (sum > 0)
		{					/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;	/* ...but don't report the SYN octet as readable */
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop. This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if (sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
1168 /*1169 * LISTEN is a special case for select..1170 */1171 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */1172 {1173 if (sel_type == SEL_IN) {1174 intretval;
1175
1176 sk->inuse = 1;
1177 retval = (tcp_find_established(sk) != NULL);
1178 release_sock(sk);
1179 if (!retval)
1180 select_wait(&master_select_wakeup,wait);
1181 returnretval;
1182 }1183 return 0;
1184 }1185
1186
1187 /*1188 * Wait for a TCP event.1189 *1190 * Note that we don't need to set "sk->inuse", as the upper select layers1191 * take care of normal races (between the test and the event) and we don't1192 * go look at any of the socket buffers directly.1193 */1194 staticinttcp_select(structsock *sk, intsel_type, select_table *wait)
/* */1195 {1196 if (sk->state == TCP_LISTEN)
1197 returntcp_listen_select(sk, sel_type, wait);
1198
1199 switch(sel_type) {1200 caseSEL_IN:
1201 if (sk->err)
1202 return 1;
1203 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1204 break;
1205
1206 if (sk->shutdown & RCV_SHUTDOWN)
1207 return 1;
1208
1209 if (sk->acked_seq == sk->copied_seq)
1210 break;
1211
1212 if (sk->urg_seq != sk->copied_seq ||
1213 sk->acked_seq != sk->copied_seq+1 ||
1214 sk->urginline || !sk->urg_data)
1215 return 1;
1216 break;
1217
1218 caseSEL_OUT:
1219 if (sk->err)
1220 return 1;
1221 if (sk->shutdown & SEND_SHUTDOWN)
1222 return 0;
1223 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1224 break;
1225 /*1226 * This is now right thanks to a small fix1227 * by Matt Dillon.1228 */1229
1230 if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
1231 break;
1232 return 1;
1233
1234 caseSEL_EX:
1235 if (sk->urg_data)
1236 return 1;
1237 break;
1238 }1239 select_wait(sk->sleep, wait);
1240 return 0;
1241 }1242
1243 inttcp_ioctl(structsock *sk, intcmd, unsignedlongarg)
/* */1244 {1245 interr;
1246 switch(cmd)
1247 {1248
1249 caseTIOCINQ:
1250 #ifdef FIXME /* FIXME: */1251 caseFIONREAD:
1252 #endif1253 {1254 unsignedlongamount;
1255
1256 if (sk->state == TCP_LISTEN)
1257 return(-EINVAL);
1258
1259 sk->inuse = 1;
1260 amount = tcp_readable(sk);
1261 release_sock(sk);
1262 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1263 if(err)
1264 returnerr;
1265 put_user(amount, (int *)arg);
1266 return(0);
1267 }1268 caseSIOCATMARK:
1269 {1270 intansw = sk->urg_data && sk->urg_seq == sk->copied_seq;
1271
1272 err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1273 if (err)
1274 returnerr;
1275 put_user(answ,(int *) arg);
1276 return(0);
1277 }1278 caseTIOCOUTQ:
1279 {1280 unsignedlongamount;
1281
1282 if (sk->state == TCP_LISTEN) return(-EINVAL);
1283 amount = sock_wspace(sk);
1284 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1285 if(err)
1286 returnerr;
1287 put_user(amount, (int *)arg);
1288 return(0);
1289 }1290 default:
1291 return(-EINVAL);
1292 }1293 }1294
1295
1296 /*1297 * This routine computes a TCP checksum. 1298 *1299 * Modified January 1995 from a go-faster DOS routine by1300 * Jorge Cwik <jorge@laser.satlink.net>1301 */1302
/*
 *	Compute the TCP checksum over 'len' bytes, folding in the
 *	pseudo-header (saddr/daddr/protocol/length).  'base' is a partial
 *	checksum already accumulated over the TCP segment.
 */
unsigned short tcp_check(struct tcphdr *th, int len,
	 unsigned long saddr, unsigned long daddr, unsigned long base)
{
	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
}
1309
1310
1311 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1312 unsignedlongdaddr, intlen, structsock *sk)
1313 {1314 th->check = 0;
1315 th->check = tcp_check(th, len, saddr, daddr,
1316 csum_partial((char *)th,len,0));
1317 return;
1318 }1319
1320 /*1321 * This is the main buffer sending routine. We queue the buffer1322 * having checked it is sane seeming.1323 */1324
/*
 *	This is the main buffer sending routine.  Sanity-checks the frame,
 *	then either transmits it immediately or queues it on the write
 *	queue when the window / congestion / Nagle rules say we must wait.
 */
static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 *	length of packet (not counting length of pre-tcp headers)
	 */

	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 *	Sanity check it..
	 */

	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 *	If we have queued a header size packet.. (these crash a few
	 *	tcp stacks if ack is not set)
	 */

	if (size == sizeof(struct tcphdr))
	{
		/* If it's got a syn or fin it's notionally included in the size..*/
		if (!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 *	Actual processing.
	 */

	tcp_statistics.TcpOutSegs++;
	/* Right edge of this frame in sequence space (doff excludes options). */
	skb->h.seq = ntohl(th->seq) + size - 4*th->doff;

	/*
	 *	We must queue if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */

	if (after(skb->h.seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit. So
		 * we shouldn't need to set it at all. I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 *	If we don't fit we have to start the zero window
		 *	probes. This is broken - we really need to do a partial
		 *	send _first_ (This is what causes the Cisco and PC/TCP
		 *	grief).
		 */

		if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 *	This is going straight out
		 */

		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 *	This is mad. The tcp retransmit queue is put together
		 *	by the ip layer. This causes half the problems with
		 *	unroutable FIN's and other things.
		 */

		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 *	Set for next retransmit based on expected ACK time.
		 *	FIXME: We set this every time which means our
		 *	retransmits are really about a window behind.
		 */

		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
1435 /*1436 * Locking problems lead us to a messy situation where we can have1437 * multiple partially complete buffers queued up. This is really bad1438 * as we don't want to be sending partial buffers. Fix this with1439 * a semaphore or similar to lock tcp_write per socket.1440 *1441 * These routines are pretty self descriptive.1442 */1443
1444 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1445 {1446 structsk_buff * skb;
1447 unsignedlongflags;
1448
1449 save_flags(flags);
1450 cli();
1451 skb = sk->partial;
1452 if (skb) {1453 sk->partial = NULL;
1454 del_timer(&sk->partial_timer);
1455 }1456 restore_flags(flags);
1457 returnskb;
1458 }1459
1460 /*1461 * Empty the partial queue1462 */1463
1464 staticvoidtcp_send_partial(structsock *sk)
/* */1465 {1466 structsk_buff *skb;
1467
1468 if (sk == NULL)
1469 return;
1470 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1471 tcp_send_skb(sk, skb);
1472 }1473
1474 /*1475 * Queue a partial frame1476 */1477
1478 voidtcp_enqueue_partial(structsk_buff * skb, structsock * sk)
/* */1479 {1480 structsk_buff * tmp;
1481 unsignedlongflags;
1482
1483 save_flags(flags);
1484 cli();
1485 tmp = sk->partial;
1486 if (tmp)
1487 del_timer(&sk->partial_timer);
1488 sk->partial = skb;
1489 init_timer(&sk->partial_timer);
1490 /*1491 * Wait up to 1 second for the buffer to fill.1492 */1493 sk->partial_timer.expires = jiffies+HZ;
1494 sk->partial_timer.function = (void (*)(unsignedlong)) tcp_send_partial;
1495 sk->partial_timer.data = (unsignedlong) sk;
1496 add_timer(&sk->partial_timer);
1497 restore_flags(flags);
1498 if (tmp)
1499 tcp_send_skb(sk, tmp);
1500 }1501
1502
1503 /*1504 * This routine sends an ack and also updates the window. 1505 */1506
/*
 *	Build and transmit a bare ACK carrying 'sequence'/'ack', templated
 *	on the received header 'th' (addresses/ports swapped).  Also
 *	updates the advertised window.
 */
static void tcp_send_ack(u32 sequence, u32 ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if (sk->zapped)
		return;		/* We have been reset, we may not send again */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 *	Force it to send an ack. We don't have to do this
		 *	(ACK is unreliable) but it's much better use of
		 *	bandwidth on slow links to send a spare ack than
		 *	resend packets.
		 */

		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 *	Assemble a suitable TCP frame
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route: throw the buffer away. */
		buff->free = 1;
		sock_wfree(sk, buff);
		return;
	}
	t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 *	If we have nothing queued for transmit and the transmit timer
	 *	is on we are just doing an ACK timeout and need to switch
	 *	to a keepalive.
	 */

	if (ack == sk->acked_seq)
	{
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
			&& sk->ip_xmit_timeout == TIME_WRITE)
		{
			if (sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 *	Fill in the packet and send it
	 */

	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}
1618
1619 /* 1620 * This routine builds a generic TCP header. 1621 */1622
/*
 *	This routine builds a generic TCP header, copied from the socket's
 *	template header.  'push' == 0 sets the PSH bit.  As a side effect
 *	it clears the pending-ack bookkeeping (the header carries the ack)
 *	and refreshes the advertised window.  Returns the header size.
 */
extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
{

	memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
	th->seq = htonl(sk->write_seq);
	th->psh = (push == 0) ? 1 : 0;		/* push==0 means "no more coming" */
	th->doff = sizeof(*th)/4;
	th->ack = 1;
	th->fin = 0;
	sk->ack_backlog = 0;			/* this segment acks everything pending */
	sk->bytes_rcv = 0;
	sk->ack_timed = 0;
	th->ack_seq = htonl(sk->acked_seq);
	sk->window = tcp_select_window(sk);
	th->window = htons(sk->window);

	return(sizeof(*th));
}
1642 /*1643 * This routine copies from a user buffer into a socket,1644 * and starts the transmit system.1645 */1646
1647 staticinttcp_sendmsg(structsock *sk, structmsghdr *msg,
/* */1648 intlen, intnonblock, intflags)
1649 {1650 intcopied = 0;
1651 intcopy;
1652 inttmp;
1653 intseglen;
1654 intiovct=0;
1655 structsk_buff *skb;
1656 structsk_buff *send_tmp;
1657 structproto *prot;
1658 structdevice *dev = NULL;
1659 unsignedchar *from;
1660
1661 /*1662 * Do sanity checking for sendmsg/sendto/send1663 */1664
1665 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1666 return -EINVAL;
1667 if (msg->msg_name)
1668 {1669 structsockaddr_in *addr=(structsockaddr_in *)msg->msg_name;
1670 if(sk->state == TCP_CLOSE)
1671 return -ENOTCONN;
1672 if (msg->msg_namelen < sizeof(*addr))
1673 return -EINVAL;
1674 if (addr->sin_family && addr->sin_family != AF_INET)
1675 return -EINVAL;
1676 if (addr->sin_port != sk->dummy_th.dest)
1677 return -EISCONN;
1678 if (addr->sin_addr.s_addr != sk->daddr)
1679 return -EISCONN;
1680 }1681
1682 /*1683 * Ok commence sending1684 */1685
1686 while(iovct<msg->msg_iovlen)
1687 {1688 seglen=msg->msg_iov[iovct].iov_len;
1689 from=msg->msg_iov[iovct++].iov_base;
1690 sk->inuse=1;
1691 prot = sk->prot;
1692 while(seglen > 0)
1693 {1694 if (sk->err)
1695 {/* Stop on an error */1696 release_sock(sk);
1697 if (copied)
1698 return(copied);
1699 tmp = -sk->err;
1700 sk->err = 0;
1701 return(tmp);
1702 }1703
1704 /*1705 * First thing we do is make sure that we are established. 1706 */1707
1708 if (sk->shutdown & SEND_SHUTDOWN)
1709 {1710 release_sock(sk);
1711 sk->err = EPIPE;
1712 if (copied)
1713 return(copied);
1714 sk->err = 0;
1715 return(-EPIPE);
1716 }1717
1718 /* 1719 * Wait for a connection to finish.1720 */1721
1722 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
1723 {1724 if (sk->err)
1725 {1726 release_sock(sk);
1727 if (copied)
1728 return(copied);
1729 tmp = -sk->err;
1730 sk->err = 0;
1731 return(tmp);
1732 }1733
1734 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
1735 {1736 release_sock(sk);
1737 if (copied)
1738 return(copied);
1739
1740 if (sk->err)
1741 {1742 tmp = -sk->err;
1743 sk->err = 0;
1744 return(tmp);
1745 }1746
1747 if (sk->keepopen)
1748 {1749 send_sig(SIGPIPE, current, 0);
1750 }1751 return(-EPIPE);
1752 }1753
1754 if (nonblock || copied)
1755 {1756 release_sock(sk);
1757 if (copied)
1758 return(copied);
1759 return(-EAGAIN);
1760 }1761
1762 release_sock(sk);
1763 cli();
1764
1765 if (sk->state != TCP_ESTABLISHED &&
1766 sk->state != TCP_CLOSE_WAIT && sk->err == 0)
1767 {1768 interruptible_sleep_on(sk->sleep);
1769 if (current->signal & ~current->blocked)
1770 {1771 sti();
1772 if (copied)
1773 return(copied);
1774 return(-ERESTARTSYS);
1775 }1776 }1777 sk->inuse = 1;
1778 sti();
1779 }1780
1781 /*1782 * The following code can result in copy <= if sk->mss is ever1783 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).1784 * sk->mtu is constant once SYN processing is finished. I.e. we1785 * had better not get here until we've seen his SYN and at least one1786 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)1787 * But ESTABLISHED should guarantee that. sk->max_window is by definition1788 * non-decreasing. Note that any ioctl to set user_mss must be done1789 * before the exchange of SYN's. If the initial ack from the other1790 * end has a window of 0, max_window and thus mss will both be 0.1791 */1792
1793 /* 1794 * Now we need to check if we have a half built packet. 1795 */1796
1797 if ((skb = tcp_dequeue_partial(sk)) != NULL)
1798 {1799 inthdrlen;
1800
1801 /* IP header + TCP header */1802 hdrlen = ((unsignedlong)skb->h.th - (unsignedlong)skb->data)
1803 + sizeof(structtcphdr);
1804
1805 /* Add more stuff to the end of skb->len */1806 if (!(flags & MSG_OOB))
1807 {1808 copy = min(sk->mss - (skb->len - hdrlen), len);
1809 /* FIXME: this is really a bug. */1810 if (copy <= 0)
1811 {1812 printk("TCP: **bug**: \"copy\" <= 0!!\n");
1813 copy = 0;
1814 }1815 memcpy_fromfs(skb_put(skb,copy), from, copy);
1816 from += copy;
1817 copied += copy;
1818 len -= copy;
1819 seglen -= copy;
1820 sk->write_seq += copy;
1821 seglen -= copy;
1822 }1823 if ((skb->len - hdrlen) >= sk->mss ||
1824 (flags & MSG_OOB) || !sk->packets_out)
1825 tcp_send_skb(sk, skb);
1826 else1827 tcp_enqueue_partial(skb, sk);
1828 continue;
1829 }1830
1831 /*1832 * We also need to worry about the window.1833 * If window < 1/2 the maximum window we've seen from this1834 * host, don't use it. This is sender side1835 * silly window prevention, as specified in RFC1122.1836 * (Note that this is different than earlier versions of1837 * SWS prevention, e.g. RFC813.). What we actually do is 1838 * use the whole MSS. Since the results in the right1839 * edge of the packet being outside the window, it will1840 * be queued for later rather than sent.1841 */1842
1843 copy = sk->window_seq - sk->write_seq;
1844 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1845 copy = sk->mss;
1846 if (copy > len)
1847 copy = len;
1848
1849 /*1850 * We should really check the window here also. 1851 */1852
1853 send_tmp = NULL;
1854 if (copy < sk->mss && !(flags & MSG_OOB))
1855 {1856 /*1857 * We will release the socket in case we sleep here. 1858 */1859 release_sock(sk);
1860 /*1861 * NB: following must be mtu, because mss can be increased.1862 * mss is always <= mtu 1863 */1864 skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
1865 sk->inuse = 1;
1866 send_tmp = skb;
1867 }1868 else1869 {1870 /*1871 * We will release the socket in case we sleep here. 1872 */1873 release_sock(sk);
1874 skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
1875 sk->inuse = 1;
1876 }1877
1878 /*1879 * If we didn't get any memory, we need to sleep. 1880 */1881
1882 if (skb == NULL)
1883 {1884 sk->socket->flags |= SO_NOSPACE;
1885 if (nonblock)
1886 {1887 release_sock(sk);
1888 if (copied)
1889 return(copied);
1890 return(-EAGAIN);
1891 }1892
1893 /*1894 * FIXME: here is another race condition. 1895 */1896
1897 tmp = sk->wmem_alloc;
1898 release_sock(sk);
1899 cli();
1900 /*1901 * Again we will try to avoid it. 1902 */1903 if (tmp <= sk->wmem_alloc &&
1904 (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1905 && sk->err == 0)
1906 {1907 sk->socket->flags &= ~SO_NOSPACE;
1908 interruptible_sleep_on(sk->sleep);
1909 if (current->signal & ~current->blocked)
1910 {1911 sti();
1912 if (copied)
1913 return(copied);
1914 return(-ERESTARTSYS);
1915 }1916 }1917 sk->inuse = 1;
1918 sti();
1919 continue;
1920 }1921
1922 skb->sk = sk;
1923 skb->free = 0;
1924 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1925
1926 /*1927 * FIXME: we need to optimize this.1928 * Perhaps some hints here would be good.1929 */1930
1931 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1932 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl);
1933 if (tmp < 0 )
1934 {1935 sock_wfree(sk, skb);
1936 release_sock(sk);
1937 if (copied)
1938 return(copied);
1939 return(tmp);
1940 }1941 skb->dev = dev;
1942 skb->h.th =(structtcphdr *)skb_put(skb,sizeof(structtcphdr));
1943 tmp = tcp_build_header(skb->h.th, sk, len-copy);
1944 if (tmp < 0)
1945 {1946 sock_wfree(sk, skb);
1947 release_sock(sk);
1948 if (copied)
1949 return(copied);
1950 return(tmp);
1951 }1952
1953 if (flags & MSG_OOB)
1954 {1955 skb->h.th->urg = 1;
1956 skb->h.th->urg_ptr = ntohs(copy);
1957 }1958
1959 memcpy_fromfs(skb_put(skb,copy), from, copy);
1960
1961 from += copy;
1962 copied += copy;
1963 len -= copy;
1964 seglen -= copy;
1965 skb->free = 0;
1966 sk->write_seq += copy;
1967
1968 if (send_tmp != NULL && sk->packets_out)
1969 {1970 tcp_enqueue_partial(send_tmp, sk);
1971 continue;
1972 }1973 tcp_send_skb(sk, skb);
1974 }1975 }1976 sk->err = 0;
1977
1978 /*1979 * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly1980 * interactive fast network servers. It's meant to be on and1981 * it really improves the throughput though not the echo time1982 * on my slow slip link - Alan1983 */1984
1985 /*1986 * Avoid possible race on send_tmp - c/o Johannes Stille 1987 */1988
1989 if(sk->partial && ((!sk->packets_out)
1990 /* If not nagling we can send on the before case too.. */1991 || (sk->nonagle && before(sk->write_seq , sk->window_seq))
1992 ))
1993 tcp_send_partial(sk);
1994
1995 release_sock(sk);
1996 return(copied);
1997 }1998
1999 staticinttcp_sendto(structsock *sk, constunsignedchar *ubuf, intsize, intnoblock, unsignedflags,
/* */2000 structsockaddr_in *sin, intaddr_len)
2001 {2002 structioveciov;
2003 structmsghdrmsg;
2004
2005 iov.iov_base = (void *)ubuf;
2006 iov.iov_len = size;
2007
2008 msg.msg_name = (void *)sin;
2009 msg.msg_namelen = addr_len;
2010 msg.msg_accrights = NULL;
2011 msg.msg_iov = &iov;
2012 msg.msg_iovlen = 1;
2013
2014 returntcp_sendmsg(sk, &msg, size, noblock, flags);
2015 }2016
/*
 *	write()/send() entry point: a sendto() with no destination address.
 */
static int tcp_write(struct sock *sk, const unsigned char *ubuf, int size, int noblock, unsigned flags)
{
	return tcp_sendto(sk,ubuf,size,noblock,flags,NULL,0);
}
2022
2023 /*2024 * Send an ack if one is backlogged at this point. Ought to merge2025 * this with tcp_send_ack().2026 */2027
/*
 *	Send an ack if one is backlogged at this point. Ought to merge
 *	this with tcp_send_ack().
 */
static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	if (!sk->ack_backlog)
		return;

	/*
	 * If we're closed, don't send an ack, or we'll get a RST
	 * from the closed destination.
	 */
	if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
		return;

	/*
	 * FIXME: we need to put code here to prevent this routine from
	 * being called.  Being called once in a while is ok, so only check
	 * if this is the second time in a row.
	 */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route: throw the buffer away. */
		buff->free = 1;
		sock_wfree(sk, buff);
		return;
	}

	t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	/* Build a pure ACK from the socket's template header. */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2102
2103 /*2104 * FIXME:2105 * This routine frees used buffers.2106 * It should consider sending an ACK to let the2107 * other end know we now have a bigger window.2108 */2109
/*
 *	FIXME:
 *	This routine frees used buffers.
 *	It should consider sending an ACK to let the
 *	other end know we now have a bigger window.
 */
static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if (sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	left = sock_rspace(sk);

	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can.
	 */

	while ((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		/* Stop at the first buffer still unread or in use. */
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */

	if (sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sock_rspace(sk),
			left);
	if ((rspace=sock_rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
	/*
	 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
	 * if the other end is offering a window smaller than the agreed on MSS
	 * (called sk->mtu here).  In theory there's no connection between send
	 * and receive, and so no reason to think that they're going to send
	 * small packets.  For the moment I'm using the hack of reducing the mss
	 * only on the send side, so I'm putting mtu here.
	 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			/* NOTE(review): this compares against sk->timer.expires while
			 * del_timer() was called on sk->retransmit_timer — looks
			 * inconsistent; confirm which timer is intended. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);
		}
	}
}
2192
2193 /*2194 * Handle reading urgent data. BSD has very simple semantics for2195 * this, no blocking and very strange errors 8)2196 */2197
/*
 *	Handle reading urgent data (recv with MSG_OOB).  BSD semantics:
 *	never blocks; returns 1 with the single urgent byte copied out,
 *	0 at end of stream, or a negative error.  MSG_PEEK leaves the
 *	urgent byte unconsumed.
 */
static int tcp_recv_urg(struct sock * sk, int nonblock,
	     struct msghdr *msg, int len, int flags, int *addr_len)
{
	/*
	 *	No URG data to read
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
	{
		/* Report and clear any pending socket error. */
		int tmp = -sk->err;
		sk->err = 0;
		return tmp;
	}

	if (sk->state == TCP_CLOSE || sk->done)
	{
		/* First read after close returns 0 (EOF), later ones error. */
		if (!sk->done)
		{
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID)
	{
		/* The urgent byte is stored in the low bits of urg_data. */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;	/* consume it */
		memcpy_toiovec(msg->msg_iov, &c, 1);
		if (msg->msg_name)
		{
			/* Fill in the peer's address if the caller asked. */
			struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
			sin->sin_family = AF_INET;
			sin->sin_addr.s_addr = sk->daddr;
			sin->sin_port = sk->dummy_th.dest;
		}
		if (addr_len)
			*addr_len = sizeof(struct sockaddr_in);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
2260
2261 /*2262 * This routine copies from a sock struct into the user buffer. 2263 */2264
/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Walks sk->receive_queue copying in-sequence data into msg->msg_iov,
 *	stopping at urgent data, a FIN, an error, or when len is satisfied.
 *	Blocks (unless nonblock) when no data is ready.  Returns the number
 *	of bytes copied, 0 at EOF, or a negative errno.
 */
static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags, int *addr_len)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */
	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */
	if (flags & MSG_OOB)
		return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 *	MSG_PEEK advances a private copy so the real copied_seq
	 *	(and hence the data) is left untouched.
	 */
	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		u32 offset;

		/*
		 *	Are we at urgent data? Stop if we have read anything.
		 */
		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.  Set INTERRUPTIBLE first so a
		 *	wakeup between the queue scan and schedule() is not lost.
		 */
		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* A gap before this skb: nothing contiguous to read yet. */
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies one sequence number but no data */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* fully consumed; cleanup_rbuf may free it */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* Something was copied on an earlier pass: return it now. */
		if (copied)
			break;

		if (sk->err)
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		/* Closed connection: first read returns EOF, later ENOTCONN. */
		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Nothing ready: ack what we consumed, then sleep for data. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */
		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */
		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?  If so, stop short of it,
		 *	and skip over the urgent byte itself unless urginline.
		 */
		if (sk->urg_data)
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					if (!sk->urginline)
					{
						++*seq;		/* consume the urgent byte out-of-band */
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;	/* read only up to the urgent mark */
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */
		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */
		memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */
		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;	/* urgent byte passed: clear the flag */
		if (used + offset < skb->len)
			continue;	/* more data left in this skb */

		/*
		 *	Process the FIN.
		 */
		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;	/* fully read; eligible for freeing */
		continue;

	found_fin_ok:
		++*seq;		/* the FIN consumes one sequence number */
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */
		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}

	/* Report the peer's address if the caller asked for it. */
	if(copied>0 && msg->msg_name)
	{
		struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
		sin->sin_family=AF_INET;
		sin->sin_addr.s_addr=sk->daddr;
		sin->sin_port=sk->dummy_th.dest;
	}
	if(addr_len)
		*addr_len=sizeof(struct sockaddr_in);

	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2502
2503 staticinttcp_recvfrom(structsock *sk, unsignedchar *ubuf, intsize, intnoblock, unsignedflags,
/* */2504 structsockaddr_in *sa, int *addr_len)
2505 {2506 structioveciov;
2507 structmsghdrmsg;
2508
2509 iov.iov_base = (void *)ubuf;
2510 iov.iov_len = size;
2511
2512 msg.msg_name = (void *)sa;
2513 msg.msg_namelen = 0;
2514 if (addr_len)
2515 msg.msg_namelen = *addr_len;
2516 msg.msg_accrights = NULL;
2517 msg.msg_iov = &iov;
2518 msg.msg_iovlen = 1;
2519
2520 returntcp_recvmsg(sk, &msg, size, noblock, flags, addr_len);
2521 }2522
2523 inttcp_read(structsock *sk, unsignedchar *buff, intlen, intnoblock,
/* */2524 unsignedflags)
2525 {2526 return(tcp_recvfrom(sk, buff, len, noblock, flags, NULL, NULL));
2527 }2528
2529
2530 /*2531 * State processing on a close. This implements the state shift for2532 * sending our FIN frame. Note that we only send a FIN for some 2533 * states. A shutdown() may have already sent the FIN, or we may be2534 * closed.2535 */2536
2537 staticinttcp_close_state(structsock *sk, intdead)
/* */2538 {2539 intns=TCP_CLOSE;
2540 intsend_fin=0;
2541 switch(sk->state)
2542 {2543 caseTCP_SYN_SENT: /* No SYN back, no FIN needed */2544 break;
2545 caseTCP_SYN_RECV:
2546 caseTCP_ESTABLISHED: /* Closedown begin */2547 ns=TCP_FIN_WAIT1;
2548 send_fin=1;
2549 break;
2550 caseTCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */2551 caseTCP_FIN_WAIT2:
2552 caseTCP_CLOSING:
2553 ns=sk->state;
2554 break;
2555 caseTCP_CLOSE:
2556 caseTCP_LISTEN:
2557 break;
2558 caseTCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and2559 wait only for the ACK */2560 ns=TCP_LAST_ACK;
2561 send_fin=1;
2562 }2563
2564 tcp_set_state(sk,ns);
2565
2566 /*2567 * This is a (useful) BSD violating of the RFC. There is a2568 * problem with TCP as specified in that the other end could2569 * keep a socket open forever with no application left this end.2570 * We use a 3 minute timeout (about the same as BSD) then kill2571 * our end. If they send after that then tough - BUT: long enough2572 * that we won't make the old 4*rto = almost no time - whoops2573 * reset mistake.2574 */2575 if(dead && ns==TCP_FIN_WAIT2)
2576 {2577 inttimer_active=del_timer(&sk->timer);
2578 if(timer_active)
2579 add_timer(&sk->timer);
2580 else2581 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2582 }2583
2584 returnsend_fin;
2585 }2586
2587 /*2588 * Send a fin.2589 */2590
/*
 *	Send a fin.
 *
 *	Builds a FIN segment from the socket's template header
 *	(sk->dummy_th) and either transmits it immediately or, if data
 *	is still queued for sending, appends it to the write queue so
 *	it goes out after the data.  Advances sk->write_seq by one for
 *	the FIN.
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */
	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost.
		 *	(Not good).  Still consume the FIN's sequence number
		 *	and make sure a close timer is running.
		 */
		buff->free = 1;
		sock_wfree(sk,buff);
		sk->write_seq++;
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);	/* leave the running timer alone */
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}

	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */
	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	t1->seq = ntohl(sk->write_seq);
	sk->write_seq++;		/* the FIN consumes one sequence number */
	buff->h.seq = sk->write_seq;
	t1->ack = 1;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 *	If there is data in the write queue, the fin must be appended to
	 *	the write queue so it is sent in order after the pending data.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		buff->free = 0;
		if (buff->next != NULL)
		{
			/* NOTE(review): should not happen - a fresh buffer on a list. */
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	}
	else
	{
		/* Queue empty: transmit now and arm the retransmit timer. */
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2687 /*2688 * Shutdown the sending side of a connection. Much like close except2689 * that we don't receive shut down or set sk->dead=1.2690 */2691
2692 voidtcp_shutdown(structsock *sk, inthow)
/* */2693 {2694 /*2695 * We need to grab some memory, and put together a FIN,2696 * and then put it into the queue to be sent.2697 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2698 */2699
2700 if (!(how & SEND_SHUTDOWN))
2701 return;
2702
2703 /*2704 * If we've already sent a FIN, or it's a closed state2705 */2706
2707 if (sk->state == TCP_FIN_WAIT1 ||
2708 sk->state == TCP_FIN_WAIT2 ||
2709 sk->state == TCP_CLOSING ||
2710 sk->state == TCP_LAST_ACK ||
2711 sk->state == TCP_TIME_WAIT ||
2712 sk->state == TCP_CLOSE ||
2713 sk->state == TCP_LISTEN2714 )
2715 {2716 return;
2717 }2718 sk->inuse = 1;
2719
2720 /*2721 * flag that the sender has shutdown2722 */2723
2724 sk->shutdown |= SEND_SHUTDOWN;
2725
2726 /*2727 * Clear out any half completed packets. 2728 */2729
2730 if (sk->partial)
2731 tcp_send_partial(sk);
2732
2733 /*2734 * FIN if needed2735 */2736
2737 if(tcp_close_state(sk,0))
2738 tcp_send_fin(sk);
2739
2740 release_sock(sk);
2741 }2742
2743 /*2744 * This routine will send an RST to the other tcp. 2745 */2746
2747 staticvoidtcp_reset(unsignedlongsaddr, unsignedlongdaddr, structtcphdr *th,
/* */2748 structproto *prot, structoptions *opt, structdevice *dev, inttos, intttl)
2749 {2750 structsk_buff *buff;
2751 structtcphdr *t1;
2752 inttmp;
2753 structdevice *ndev=NULL;
2754
2755 /*2756 * Cannot reset a reset (Think about it).2757 */2758
2759 if(th->rst)
2760 return;
2761
2762 /*2763 * We need to grab some memory, and put together an RST,2764 * and then put it into the queue to be sent.2765 */2766
2767 buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2768 if (buff == NULL)
2769 return;
2770
2771 buff->sk = NULL;
2772 buff->dev = dev;
2773 buff->localroute = 0;
2774
2775 /*2776 * Put in the IP header and routing stuff. 2777 */2778
2779 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2780 sizeof(structtcphdr),tos,ttl);
2781 if (tmp < 0)
2782 {2783 buff->free = 1;
2784 sock_wfree(NULL, buff);
2785 return;
2786 }2787
2788 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
2789 memcpy(t1, th, sizeof(*t1));
2790
2791 /*2792 * Swap the send and the receive. 2793 */2794
2795 t1->dest = th->source;
2796 t1->source = th->dest;
2797 t1->rst = 1;
2798 t1->window = 0;
2799
2800 if(th->ack)
2801 {2802 t1->ack = 0;
2803 t1->seq = th->ack_seq;
2804 t1->ack_seq = 0;
2805 }2806 else2807 {2808 t1->ack = 1;
2809 if(!th->syn)
2810 t1->ack_seq=htonl(th->seq);
2811 else2812 t1->ack_seq=htonl(th->seq+1);
2813 t1->seq=0;
2814 }2815
2816 t1->syn = 0;
2817 t1->urg = 0;
2818 t1->fin = 0;
2819 t1->psh = 0;
2820 t1->doff = sizeof(*t1)/4;
2821 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2822 prot->queue_xmit(NULL, ndev, buff, 1);
2823 tcp_statistics.TcpOutSegs++;
2824 }2825
2826
2827 /*2828 * Look for tcp options. Parses everything but only knows about MSS.2829 * This routine is always called with the packet containing the SYN.2830 * However it may also be called with the ack to the SYN. So you2831 * can't assume this is always the SYN. It's always called after2832 * we have set up sk->mtu to our own MTU.2833 *2834 * We need at minimum to add PAWS support here. Possibly large windows2835 * as Linux gets deployed on 100Mb/sec networks.2836 */2837
2838 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2839 {2840 unsignedchar *ptr;
2841 intlength=(th->doff*4)-sizeof(structtcphdr);
2842 intmss_seen = 0;
2843
2844 ptr = (unsignedchar *)(th + 1);
2845
2846 while(length>0)
2847 {2848 intopcode=*ptr++;
2849 intopsize=*ptr++;
2850 switch(opcode)
2851 {2852 caseTCPOPT_EOL:
2853 return;
2854 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */2855 length--;
2856 ptr--; /* the opsize=*ptr++ above was a mistake */2857 continue;
2858
2859 default:
2860 if(opsize<=2) /* Avoid silly options looping forever */2861 return;
2862 switch(opcode)
2863 {2864 caseTCPOPT_MSS:
2865 if(opsize==4 && th->syn)
2866 {2867 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2868 mss_seen = 1;
2869 }2870 break;
2871 /* Add other options here as people feel the urge to implement stuff like large windows */2872 }2873 ptr+=opsize-2;
2874 length-=opsize;
2875 }2876 }2877 if (th->syn)
2878 {2879 if (! mss_seen)
2880 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2881 }2882 #ifdefCONFIG_INET_PCTCP2883 sk->mss = min(sk->max_window >> 1, sk->mtu);
2884 #else2885 sk->mss = min(sk->max_window, sk->mtu);
2886 #endif2887 }2888
/*
 *	Classful netmask for an address (network byte order in and out):
 *	class A -> /8, class B -> /16, everything else treated as /24.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);	/* classify in host order */

	if (IN_CLASSA(host))
		return htonl(IN_CLASSA_NET);
	return IN_CLASSB(host) ? htonl(IN_CLASSB_NET) : htonl(IN_CLASSC_NET);
}
2899 /*2900 * Default sequence number picking algorithm.2901 * As close as possible to RFC 793, which2902 * suggests using a 250kHz clock.2903 * Further reading shows this assumes 2MB/s networks.2904 * For 10MB/s ethernet, a 1MHz clock is appropriate.2905 * That's funny, Linux has one built in! Use it!2906 */2907
2908 externinlineu32tcp_init_seq(void)
/* */2909 {2910 structtimevaltv;
2911 do_gettimeofday(&tv);
2912 returntv.tv_usec+tv.tv_sec*1000000;
2913 }2914
2915 /*2916 * This routine handles a connection request.2917 * It should make sure we haven't already responded.2918 * Because of the way BSD works, we have to send a syn/ack now.2919 * This also means it will be harder to close a socket which is2920 * listening.2921 */2922
2923 staticvoidtcp_conn_request(structsock *sk, structsk_buff *skb,
/* */2924 unsignedlongdaddr, unsignedlongsaddr,
2925 structoptions *opt, structdevice *dev, u32seq)
2926 {2927 structsk_buff *buff;
2928 structtcphdr *t1;
2929 unsignedchar *ptr;
2930 structsock *newsk;
2931 structtcphdr *th;
2932 structdevice *ndev=NULL;
2933 inttmp;
2934 structrtable *rt;
2935
2936 th = skb->h.th;
2937
2938 /* If the socket is dead, don't accept the connection. */2939 if (!sk->dead)
2940 {2941 sk->data_ready(sk,0);
2942 }2943 else2944 {2945 if(sk->debug)
2946 printk("Reset on %p: Connect on dead socket.\n",sk);
2947 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2948 tcp_statistics.TcpAttemptFails++;
2949 kfree_skb(skb, FREE_READ);
2950 return;
2951 }2952
2953 /*2954 * Make sure we can accept more. This will prevent a2955 * flurry of syns from eating up all our memory.2956 */2957
2958 if (sk->ack_backlog >= sk->max_ack_backlog)
2959 {2960 tcp_statistics.TcpAttemptFails++;
2961 kfree_skb(skb, FREE_READ);
2962 return;
2963 }2964
2965 /*2966 * We need to build a new sock struct.2967 * It is sort of bad to have a socket without an inode attached2968 * to it, but the wake_up's will just wake up the listening socket,2969 * and if the listening socket is destroyed before this is taken2970 * off of the queue, this will take care of it.2971 */2972
2973 newsk = (structsock *) kmalloc(sizeof(structsock), GFP_ATOMIC);
2974 if (newsk == NULL)
2975 {2976 /* just ignore the syn. It will get retransmitted. */2977 tcp_statistics.TcpAttemptFails++;
2978 kfree_skb(skb, FREE_READ);
2979 return;
2980 }2981
2982 memcpy(newsk, sk, sizeof(*newsk));
2983 newsk->opt = NULL;
2984 if (opt && opt->optlen) {2985 sk->opt = (structoptions*)kmalloc(sizeof(structoptions)+opt->optlen, GFP_ATOMIC);
2986 if (!sk->opt) {2987 kfree_s(newsk, sizeof(structsock));
2988 tcp_statistics.TcpAttemptFails++;
2989 kfree_skb(skb, FREE_READ);
2990 return;
2991 }2992 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {2993 kfree_s(sk->opt, sizeof(structoptions)+opt->optlen);
2994 kfree_s(newsk, sizeof(structsock));
2995 tcp_statistics.TcpAttemptFails++;
2996 kfree_skb(skb, FREE_READ);
2997 return;
2998 }2999 }3000 skb_queue_head_init(&newsk->write_queue);
3001 skb_queue_head_init(&newsk->receive_queue);
3002 newsk->send_head = NULL;
3003 newsk->send_tail = NULL;
3004 skb_queue_head_init(&newsk->back_log);
3005 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/3006 newsk->rto = TCP_TIMEOUT_INIT;
3007 newsk->mdev = 0;
3008 newsk->max_window = 0;
3009 newsk->cong_window = 1;
3010 newsk->cong_count = 0;
3011 newsk->ssthresh = 0;
3012 newsk->backoff = 0;
3013 newsk->blog = 0;
3014 newsk->intr = 0;
3015 newsk->proc = 0;
3016 newsk->done = 0;
3017 newsk->partial = NULL;
3018 newsk->pair = NULL;
3019 newsk->wmem_alloc = 0;
3020 newsk->rmem_alloc = 0;
3021 newsk->localroute = sk->localroute;
3022
3023 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
3024
3025 newsk->err = 0;
3026 newsk->shutdown = 0;
3027 newsk->ack_backlog = 0;
3028 newsk->acked_seq = skb->h.th->seq+1;
3029 newsk->copied_seq = skb->h.th->seq+1;
3030 newsk->fin_seq = skb->h.th->seq;
3031 newsk->state = TCP_SYN_RECV;
3032 newsk->timeout = 0;
3033 newsk->ip_xmit_timeout = 0;
3034 newsk->write_seq = seq;
3035 newsk->window_seq = newsk->write_seq;
3036 newsk->rcv_ack_seq = newsk->write_seq;
3037 newsk->urg_data = 0;
3038 newsk->retransmits = 0;
3039 newsk->linger=0;
3040 newsk->destroy = 0;
3041 init_timer(&newsk->timer);
3042 newsk->timer.data = (unsignedlong)newsk;
3043 newsk->timer.function = &net_timer;
3044 init_timer(&newsk->retransmit_timer);
3045 newsk->retransmit_timer.data = (unsignedlong)newsk;
3046 newsk->retransmit_timer.function=&retransmit_timer;
3047 newsk->dummy_th.source = skb->h.th->dest;
3048 newsk->dummy_th.dest = skb->h.th->source;
3049
3050 /*3051 * Swap these two, they are from our point of view. 3052 */3053
3054 newsk->daddr = saddr;
3055 newsk->saddr = daddr;
3056
3057 put_sock(newsk->num,newsk);
3058 newsk->dummy_th.res1 = 0;
3059 newsk->dummy_th.doff = 6;
3060 newsk->dummy_th.fin = 0;
3061 newsk->dummy_th.syn = 0;
3062 newsk->dummy_th.rst = 0;
3063 newsk->dummy_th.psh = 0;
3064 newsk->dummy_th.ack = 0;
3065 newsk->dummy_th.urg = 0;
3066 newsk->dummy_th.res2 = 0;
3067 newsk->acked_seq = skb->h.th->seq + 1;
3068 newsk->copied_seq = skb->h.th->seq + 1;
3069 newsk->socket = NULL;
3070
3071 /*3072 * Grab the ttl and tos values and use them 3073 */3074
3075 newsk->ip_ttl=sk->ip_ttl;
3076 newsk->ip_tos=skb->ip_hdr->tos;
3077
3078 /*3079 * Use 512 or whatever user asked for 3080 */3081
3082 /*3083 * Note use of sk->user_mss, since user has no direct access to newsk 3084 */3085
3086 rt=ip_rt_route(saddr, NULL,NULL);
3087
3088 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3089 newsk->window_clamp = rt->rt_window;
3090 else3091 newsk->window_clamp = 0;
3092
3093 if (sk->user_mss)
3094 newsk->mtu = sk->user_mss;
3095 elseif(rt!=NULL && (rt->rt_flags&RTF_MSS))
3096 newsk->mtu = rt->rt_mss - sizeof(structiphdr) - sizeof(structtcphdr);
3097 else3098 {3099 #ifdefCONFIG_INET_SNARL/* Sub Nets Are Local */3100 if ((saddr ^ daddr) & default_mask(saddr))
3101 #else3102 if ((saddr ^ daddr) & dev->pa_mask)
3103 #endif3104 newsk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
3105 else3106 newsk->mtu = MAX_WINDOW;
3107 }3108
3109 /*3110 * But not bigger than device MTU 3111 */3112
3113 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
3114
3115 /*3116 * This will min with what arrived in the packet 3117 */3118
3119 tcp_options(newsk,skb->h.th);
3120
3121 tcp_cache_zap();
3122
3123 buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3124 if (buff == NULL)
3125 {3126 sk->err = ENOMEM;
3127 newsk->dead = 1;
3128 newsk->state = TCP_CLOSE;
3129 /* And this will destroy it */3130 release_sock(newsk);
3131 kfree_skb(skb, FREE_READ);
3132 tcp_statistics.TcpAttemptFails++;
3133 return;
3134 }3135
3136 buff->sk = newsk;
3137 buff->localroute = newsk->localroute;
3138
3139 /*3140 * Put in the IP header and routing stuff. 3141 */3142
3143 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3144 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
3145
3146 /*3147 * Something went wrong. 3148 */3149
3150 if (tmp < 0)
3151 {3152 sk->err = tmp;
3153 buff->free = 1;
3154 kfree_skb(buff,FREE_WRITE);
3155 newsk->dead = 1;
3156 newsk->state = TCP_CLOSE;
3157 release_sock(newsk);
3158 skb->sk = sk;
3159 kfree_skb(skb, FREE_READ);
3160 tcp_statistics.TcpAttemptFails++;
3161 return;
3162 }3163
3164 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
3165
3166 memcpy(t1, skb->h.th, sizeof(*t1));
3167 buff->h.seq = newsk->write_seq;
3168 /*3169 * Swap the send and the receive. 3170 */3171 t1->dest = skb->h.th->source;
3172 t1->source = newsk->dummy_th.source;
3173 t1->seq = ntohl(newsk->write_seq++);
3174 t1->ack = 1;
3175 newsk->window = tcp_select_window(newsk);
3176 newsk->sent_seq = newsk->write_seq;
3177 t1->window = ntohs(newsk->window);
3178 t1->res1 = 0;
3179 t1->res2 = 0;
3180 t1->rst = 0;
3181 t1->urg = 0;
3182 t1->psh = 0;
3183 t1->syn = 1;
3184 t1->ack_seq = ntohl(skb->h.th->seq+1);
3185 t1->doff = sizeof(*t1)/4+1;
3186 ptr = skb_put(buff,4);
3187 ptr[0] = 2;
3188 ptr[1] = 4;
3189 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3190 ptr[3] =(newsk->mtu) & 0xff;
3191
3192 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3193 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3194 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3195 skb->sk = newsk;
3196
3197 /*3198 * Charge the sock_buff to newsk. 3199 */3200
3201 sk->rmem_alloc -= skb->truesize;
3202 newsk->rmem_alloc += skb->truesize;
3203
3204 skb_queue_tail(&sk->receive_queue,skb);
3205 sk->ack_backlog++;
3206 release_sock(newsk);
3207 tcp_statistics.TcpOutSegs++;
3208 }3209
3210
/*
 *	Close a TCP socket.  timeout != 0 means a hard close (drop
 *	straight to TCP_CLOSE); timeout == 0 runs the normal FIN
 *	state machine and flushes unread receive data.
 */
static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 *	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 */
	sk->inuse = 1;

	/* Drop the header cache entry if it points at this socket. */
	if(th_cache_sk==sk)
		tcp_cache_zap();
	if(sk->state == TCP_LISTEN)
	{
		/* Special case: no connection, just reap pending SYNs. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;	/* both directions are done */

	if (!sk->dead)
		sk->state_change(sk);

	if (timeout == 0)
	{
		struct sk_buff *skb;

		/*
		 *	We need to flush the recv. buffs.  We do this only on the
		 *	descriptor close, not protocol-sourced closes, because the
		 *	reader process may not have drained the data yet!
		 */
		while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 *	Get rid off any half-completed packets.
		 */
		if (sk->partial)
			tcp_send_partial(sk);
	}

	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */
	if(timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* Run the close state machine; send a FIN if it says so. */
		if(tcp_close_state(sk,1)==1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
3277
3278 /*3279 * This routine takes stuff off of the write queue,3280 * and puts it in the xmit queue. This happens as incoming acks3281 * open up the remote window for us.3282 */3283
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue.  This happens as incoming acks
 *	open up the remote window for us.
 */
static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy.
	 */
	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */
	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */
		if (before(skb->h.seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it.  This
			 *	cannot currently occur.
			 */
			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
			/*
			 *	put in the ack seq and window at this point rather than earlier,
			 *	in order to keep them monotonic.  We really want to avoid taking
			 *	back window allocations.  That's legal, but RFC1122 says it's
			 *	frowned on.  Ack and window will in general have changed since
			 *	this packet was put on the write queue.
			 */
			iph = skb->ip_hdr;
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			size = skb->len - (((unsigned char *) th) - skb->data);

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			/* Header fields changed: the checksum must be redone. */
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */
			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3368
3369 /*3370 * This routine deals with incoming acks, but not outgoing ones.3371 */3372
3373 extern__inline__inttcp_ack(structsock *sk, structtcphdr *th, unsignedlongsaddr, intlen)
/* */3374 {3375 u32ack;
3376 intflag = 0;
3377
3378 /* 3379 * 1 - there was data in packet as well as ack or new data is sent or 3380 * in shutdown state3381 * 2 - data from retransmit queue was acked and removed3382 * 4 - window shrunk or data from retransmit queue was acked and removed3383 */3384
3385 if(sk->zapped)
3386 return(1); /* Dead, cant ack any more so why bother */3387
3388 /*3389 * Have we discovered a larger window3390 */3391
3392 ack = ntohl(th->ack_seq);
3393
3394 if (ntohs(th->window) > sk->max_window)
3395 {3396 sk->max_window = ntohs(th->window);
3397 #ifdefCONFIG_INET_PCTCP3398 /* Hack because we don't send partial packets to non SWS3399 handling hosts */3400 sk->mss = min(sk->max_window>>1, sk->mtu);
3401 #else3402 sk->mss = min(sk->max_window, sk->mtu);
3403 #endif3404 }3405
3406 /*3407 * We have dropped back to keepalive timeouts. Thus we have3408 * no retransmits pending.3409 */3410
3411 if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3412 sk->retransmits = 0;
3413
3414 /*3415 * If the ack is newer than sent or older than previous acks3416 * then we can probably ignore it.3417 */3418
3419 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
3420 {3421 if(sk->debug)
3422 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3423
3424 /*3425 * Keepalive processing.3426 */3427
3428 if (after(ack, sk->sent_seq))
3429 {3430 return(0);
3431 }3432
3433 /*3434 * Restart the keepalive timer.3435 */3436
3437 if (sk->keepopen)
3438 {3439 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3440 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3441 }3442 return(1);
3443 }3444
3445 /*3446 * If there is data set flag 13447 */3448
3449 if (len != th->doff*4)
3450 flag |= 1;
3451
3452 /*3453 * See if our window has been shrunk. 3454 */3455
3456 if (after(sk->window_seq, ack+ntohs(th->window)))
3457 {3458 /*3459 * We may need to move packets from the send queue3460 * to the write queue, if the window has been shrunk on us.3461 * The RFC says you are not allowed to shrink your window3462 * like this, but if the other end does, you must be able3463 * to deal with it.3464 */3465 structsk_buff *skb;
3466 structsk_buff *skb2;
3467 structsk_buff *wskb = NULL;
3468
3469 skb2 = sk->send_head;
3470 sk->send_head = NULL;
3471 sk->send_tail = NULL;
3472
3473 /*3474 * This is an artifact of a flawed concept. We want one3475 * queue and a smarter send routine when we send all.3476 */3477
3478 flag |= 4; /* Window changed */3479
3480 sk->window_seq = ack + ntohs(th->window);
3481 cli();
3482 while (skb2 != NULL)
3483 {3484 skb = skb2;
3485 skb2 = skb->link3;
3486 skb->link3 = NULL;
3487 if (after(skb->h.seq, sk->window_seq))
3488 {3489 if (sk->packets_out > 0)
3490 sk->packets_out--;
3491 /* We may need to remove this from the dev send list. */3492 if (skb->next != NULL)
3493 {3494 skb_unlink(skb);
3495 }3496 /* Now add it to the write_queue. */3497 if (wskb == NULL)
3498 skb_queue_head(&sk->write_queue,skb);
3499 else3500 skb_append(wskb,skb);
3501 wskb = skb;
3502 }3503 else3504 {3505 if (sk->send_head == NULL)
3506 {3507 sk->send_head = skb;
3508 sk->send_tail = skb;
3509 }3510 else3511 {3512 sk->send_tail->link3 = skb;
3513 sk->send_tail = skb;
3514 }3515 skb->link3 = NULL;
3516 }3517 }3518 sti();
3519 }3520
3521 /*3522 * Pipe has emptied3523 */3524
3525 if (sk->send_tail == NULL || sk->send_head == NULL)
3526 {3527 sk->send_head = NULL;
3528 sk->send_tail = NULL;
3529 sk->packets_out= 0;
3530 }3531
3532 /*3533 * Update the right hand window edge of the host3534 */3535
3536 sk->window_seq = ack + ntohs(th->window);
3537
3538 /*3539 * We don't want too many packets out there. 3540 */3541
3542 if (sk->ip_xmit_timeout == TIME_WRITE &&
3543 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
3544 {3545 /* 3546 * This is Jacobson's slow start and congestion avoidance. 3547 * SIGCOMM '88, p. 328. Because we keep cong_window in integral3548 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a 3549 * counter and increment it once every cwnd times. It's possible3550 * that this should be done only if sk->retransmits == 0. I'm3551 * interpreting "new data is acked" as including data that has3552 * been retransmitted but is just now being acked.3553 */3554 if (sk->cong_window < sk->ssthresh)
3555 /* 3556 * In "safe" area, increase3557 */3558 sk->cong_window++;
3559 else3560 {3561 /*3562 * In dangerous area, increase slowly. In theory this is3563 * sk->cong_window += 1 / sk->cong_window3564 */3565 if (sk->cong_count >= sk->cong_window)
3566 {3567 sk->cong_window++;
3568 sk->cong_count = 0;
3569 }3570 else3571 sk->cong_count++;
3572 }3573 }3574
3575 /*3576 * Remember the highest ack received.3577 */3578
3579 sk->rcv_ack_seq = ack;
3580
3581 /*3582 * If this ack opens up a zero window, clear backoff. It was3583 * being used to time the probes, and is probably far higher than3584 * it needs to be for normal retransmission.3585 */3586
3587 if (sk->ip_xmit_timeout == TIME_PROBE0)
3588 {3589 sk->retransmits = 0; /* Our probe was answered */3590
3591 /*3592 * Was it a usable window open ?3593 */3594
3595 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */3596 ! before (sk->window_seq, sk->write_queue.next->h.seq))
3597 {3598 sk->backoff = 0;
3599
3600 /*3601 * Recompute rto from rtt. this eliminates any backoff.3602 */3603
3604 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3605 if (sk->rto > 120*HZ)
3606 sk->rto = 120*HZ;
3607 if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about3608 .2 of a second because of BSD delayed acks - on a 100Mb/sec link3609 .2 of a second is going to need huge windows (SIGH) */3610 sk->rto = 20;
3611 }3612 }3613
3614 /* 3615 * See if we can take anything off of the retransmit queue.3616 */3617
3618 while(sk->send_head != NULL)
3619 {3620 /* Check for a bug. */3621 if (sk->send_head->link3 &&
3622 after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
3623 printk("INET: tcp.c: *** bug send_list out of order.\n");
3624
3625 /*3626 * If our packet is before the ack sequence we can3627 * discard it as it's confirmed to have arrived the other end.3628 */3629
3630 if (before(sk->send_head->h.seq, ack+1))
3631 {3632 structsk_buff *oskb;
3633 if (sk->retransmits)
3634 {3635 /*3636 * We were retransmitting. don't count this in RTT est 3637 */3638 flag |= 2;
3639
3640 /*3641 * even though we've gotten an ack, we're still3642 * retransmitting as long as we're sending from3643 * the retransmit queue. Keeping retransmits non-zero3644 * prevents us from getting new data interspersed with3645 * retransmissions.3646 */3647
3648 if (sk->send_head->link3) /* Any more queued retransmits? */3649 sk->retransmits = 1;
3650 else3651 sk->retransmits = 0;
3652 }3653 /*3654 * Note that we only reset backoff and rto in the3655 * rtt recomputation code. And that doesn't happen3656 * if there were retransmissions in effect. So the3657 * first new packet after the retransmissions is3658 * sent with the backoff still in effect. Not until3659 * we get an ack from a non-retransmitted packet do3660 * we reset the backoff and rto. This allows us to deal3661 * with a situation where the network delay has increased3662 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)3663 */3664
3665 /*3666 * We have one less packet out there. 3667 */3668
3669 if (sk->packets_out > 0)
3670 sk->packets_out --;
3671 /* 3672 * Wake up the process, it can probably write more. 3673 */3674 if (!sk->dead)
3675 sk->write_space(sk);
3676 oskb = sk->send_head;
3677
3678 if (!(flag&2)) /* Not retransmitting */3679 {3680 longm;
3681
3682 /*3683 * The following amusing code comes from Jacobson's3684 * article in SIGCOMM '88. Note that rtt and mdev3685 * are scaled versions of rtt and mean deviation.3686 * This is designed to be as fast as possible 3687 * m stands for "measurement".3688 */3689
3690 m = jiffies - oskb->when; /* RTT */3691 if(m<=0)
3692 m=1; /* IS THIS RIGHT FOR <0 ??? */3693 m -= (sk->rtt >> 3); /* m is now error in rtt est */3694 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */3695 if (m < 0)
3696 m = -m; /* m is now abs(error) */3697 m -= (sk->mdev >> 2); /* similar update on mdev */3698 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */3699
3700 /*3701 * Now update timeout. Note that this removes any backoff.3702 */3703
3704 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3705 if (sk->rto > 120*HZ)
3706 sk->rto = 120*HZ;
3707 if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */3708 sk->rto = 20;
3709 sk->backoff = 0;
3710 }3711 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt 3712 In this case as we just set it up */3713 cli();
3714 oskb = sk->send_head;
3715 IS_SKB(oskb);
3716 sk->send_head = oskb->link3;
3717 if (sk->send_head == NULL)
3718 {3719 sk->send_tail = NULL;
3720 }3721
3722 /*3723 * We may need to remove this from the dev send list. 3724 */3725
3726 if (oskb->next)
3727 skb_unlink(oskb);
3728 sti();
3729 kfree_skb(oskb, FREE_WRITE); /* write. */3730 if (!sk->dead)
3731 sk->write_space(sk);
3732 }3733 else3734 {3735 break;
3736 }3737 }3738
3739 /*3740 * XXX someone ought to look at this too.. at the moment, if skb_peek()3741 * returns non-NULL, we complete ignore the timer stuff in the else3742 * clause. We ought to organize the code so that else clause can3743 * (should) be executed regardless, possibly moving the PROBE timer3744 * reset over. The skb_peek() thing should only move stuff to the3745 * write queue, NOT also manage the timer functions.3746 */3747
3748 /*3749 * Maybe we can take some stuff off of the write queue,3750 * and put it onto the xmit queue.3751 */3752 if (skb_peek(&sk->write_queue) != NULL)
3753 {3754 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3755 (sk->retransmits == 0 ||
3756 sk->ip_xmit_timeout != TIME_WRITE ||
3757 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3758 && sk->packets_out < sk->cong_window)
3759 {3760 /*3761 * Add more data to the send queue.3762 */3763 flag |= 1;
3764 tcp_write_xmit(sk);
3765 }3766 elseif (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3767 sk->send_head == NULL &&
3768 sk->ack_backlog == 0 &&
3769 sk->state != TCP_TIME_WAIT)
3770 {3771 /*3772 * Data to queue but no room.3773 */3774 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3775 }3776 }3777 else3778 {3779 /*3780 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets3781 * from TCP_CLOSE we don't do anything3782 *3783 * from anything else, if there is write data (or fin) pending,3784 * we use a TIME_WRITE timeout, else if keepalive we reset to3785 * a KEEPALIVE timeout, else we delete the timer.3786 *3787 * We do not set flag for nominal write data, otherwise we may3788 * force a state where we start to write itsy bitsy tidbits3789 * of data.3790 */3791
3792 switch(sk->state) {3793 caseTCP_TIME_WAIT:
3794 /*3795 * keep us in TIME_WAIT until we stop getting packets,3796 * reset the timeout.3797 */3798 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3799 break;
3800 caseTCP_CLOSE:
3801 /*3802 * don't touch the timer.3803 */3804 break;
3805 default:
3806 /*3807 * Must check send_head, write_queue, and ack_backlog3808 * to determine which timeout to use.3809 */3810 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {3811 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3812 }elseif (sk->keepopen) {3813 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3814 }else{3815 del_timer(&sk->retransmit_timer);
3816 sk->ip_xmit_timeout = 0;
3817 }3818 break;
3819 }3820 }3821
3822 /*3823 * We have nothing queued but space to send. Send any partial3824 * packets immediately (end of Nagle rule application).3825 */3826
3827 if (sk->packets_out == 0 && sk->partial != NULL &&
3828 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
3829 {3830 flag |= 1;
3831 tcp_send_partial(sk);
3832 }3833
3834 /*3835 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and3836 * we are now waiting for an acknowledge to our FIN. The other end is3837 * already in TIME_WAIT.3838 *3839 * Move to TCP_CLOSE on success.3840 */3841
3842 if (sk->state == TCP_LAST_ACK)
3843 {3844 if (!sk->dead)
3845 sk->state_change(sk);
3846 if(sk->debug)
3847 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3848 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3849 if (sk->rcv_ack_seq == sk->write_seq/*&& sk->acked_seq == sk->fin_seq*/)
3850 {3851 flag |= 1;
3852 tcp_set_state(sk,TCP_CLOSE);
3853 sk->shutdown = SHUTDOWN_MASK;
3854 }3855 }3856
3857 /*3858 * Incoming ACK to a FIN we sent in the case of our initiating the close.3859 *3860 * Move to FIN_WAIT2 to await a FIN from the other end. Set3861 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.3862 */3863
3864 if (sk->state == TCP_FIN_WAIT1)
3865 {3866
3867 if (!sk->dead)
3868 sk->state_change(sk);
3869 if (sk->rcv_ack_seq == sk->write_seq)
3870 {3871 flag |= 1;
3872 sk->shutdown |= SEND_SHUTDOWN;
3873 tcp_set_state(sk, TCP_FIN_WAIT2);
3874 }3875 }3876
3877 /*3878 * Incoming ACK to a FIN we sent in the case of a simultaneous close.3879 *3880 * Move to TIME_WAIT3881 */3882
3883 if (sk->state == TCP_CLOSING)
3884 {3885
3886 if (!sk->dead)
3887 sk->state_change(sk);
3888 if (sk->rcv_ack_seq == sk->write_seq)
3889 {3890 flag |= 1;
3891 tcp_time_wait(sk);
3892 }3893 }3894
3895 /*3896 * Final ack of a three way shake 3897 */3898
3899 if(sk->state==TCP_SYN_RECV)
3900 {3901 tcp_set_state(sk, TCP_ESTABLISHED);
3902 tcp_options(sk,th);
3903 sk->dummy_th.dest=th->source;
3904 sk->copied_seq = sk->acked_seq;
3905 if(!sk->dead)
3906 sk->state_change(sk);
3907 if(sk->max_window==0)
3908 {3909 sk->max_window=32; /* Sanity check */3910 sk->mss=min(sk->max_window,sk->mtu);
3911 }3912 }3913
3914 /*3915 * I make no guarantees about the first clause in the following3916 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under3917 * what conditions "!flag" would be true. However I think the rest3918 * of the conditions would prevent that from causing any3919 * unnecessary retransmission. 3920 * Clearly if the first packet has expired it should be 3921 * retransmitted. The other alternative, "flag&2 && retransmits", is3922 * harder to explain: You have to look carefully at how and when the3923 * timer is set and with what timeout. The most recent transmission always3924 * sets the timer. So in general if the most recent thing has timed3925 * out, everything before it has as well. So we want to go ahead and3926 * retransmit some more. If we didn't explicitly test for this3927 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"3928 * would not be true. If you look at the pattern of timing, you can3929 * show that rto is increased fast enough that the next packet would3930 * almost never be retransmitted immediately. Then you'd end up3931 * waiting for a timeout to send each packet on the retransmission3932 * queue. With my implementation of the Karn sampling algorithm,3933 * the timeout would double each time. The net result is that it would3934 * take a hideous amount of time to recover from a single dropped packet.3935 * It's possible that there should also be a test for TIME_WRITE, but3936 * I think as long as "send_head != NULL" and "retransmit" is on, we've3937 * got to be in real retransmission mode.3938 * Note that tcp_do_retransmit is called with all==1. Setting cong_window3939 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.3940 * As long as no further losses occur, this seems reasonable.3941 */3942
3943 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3944 (((flag&2) && sk->retransmits) ||
3945 (sk->send_head->when + sk->rto < jiffies)))
3946 {3947 if(sk->send_head->when + sk->rto < jiffies)
3948 tcp_retransmit(sk,0);
3949 else3950 {3951 tcp_do_retransmit(sk, 1);
3952 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3953 }3954 }3955
3956 return(1);
3957 }3958
3959
3960 /*3961 * Process the FIN bit. This now behaves as it is supposed to work3962 * and the FIN takes effect when it is validly part of sequence3963 * space. Not before when we get holes.3964 *3965 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT3966 * (and thence onto LAST-ACK and finally, CLOSE, we never enter3967 * TIME-WAIT)3968 *3969 * If we are in FINWAIT-1, a received FIN indicates simultaneous3970 * close and we go into CLOSING (and later onto TIME-WAIT)3971 *3972 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.3973 *3974 */3975
/*
 * Handle a received FIN for this connection.
 *
 * skb - the buffer carrying the FIN segment
 * sk  - the connection's socket
 * th  - the segment's TCP header
 *
 * Records where in sequence space the FIN sits, wakes any waiter,
 * then performs the RFC793 state transition for the current state.
 * Always returns 0.
 */
static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	/* Sequence number just past the FIN: data + SYN/FIN each occupy
	   one unit of sequence space. */
	sk->fin_seq = th->seq + skb->len + th->syn + th->fin;

	if (!sk->dead)
	{
		/* Tell sleepers/selectors the state is about to change and
		   raise SIGIO for async users. */
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * Move to CLOSE_WAIT; tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk, TCP_CLOSE_WAIT);
			/* A FIN+RST combination kills both directions at once. */
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * Received a retransmission of the FIN, do nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * Received a retransmission of the FIN; restart the
			 * TIME_WAIT (2MSL) timer.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * Simultaneous close: we must ack the received FIN
			 * and enter CLOSING.
			 *
			 * This causes a WRITE timeout, which will either move
			 * on to TIME_WAIT when we time out, or resend the FIN
			 * properly (maybe we get rid of that annoying FIN
			 * lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */
			if (sk->ip_xmit_timeout != TIME_WRITE)
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk, TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * Received a FIN -- send ACK and enter TIME_WAIT.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown |= SHUTDOWN_MASK;
			tcp_set_state(sk, TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * Already in CLOSE.
			 */
			break;
		default:
			/* Any other state (e.g. LAST_ACK re-entry paths):
			   fall back to LAST_ACK and start the close timer. */
			tcp_set_state(sk, TCP_LAST_ACK);

			/* Start the timers. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}
4055
4056
4057 /*4058 * This routine handles the data. If there is room in the buffer,4059 * it will be have already been moved into it. If there is no4060 * room, then we will just have to discard the packet.4061 */4062
4063 extern__inline__inttcp_data(structsk_buff *skb, structsock *sk,
/* */4064 unsignedlongsaddr, unsignedshortlen)
4065 {4066 structsk_buff *skb1, *skb2;
4067 structtcphdr *th;
4068 intdup_dumped=0;
4069 u32new_seq, shut_seq;
4070
4071 th = skb->h.th;
4072 skb_pull(skb,th->doff*4);
4073 skb_trim(skb,len-(th->doff*4));
4074
4075 /*4076 * The bytes in the receive read/assembly queue has increased. Needed for the4077 * low memory discard algorithm 4078 */4079
4080 sk->bytes_rcv += skb->len;
4081
4082 if (skb->len == 0 && !th->fin)
4083 {4084 /* 4085 * Don't want to keep passing ack's back and forth. 4086 * (someone sent us dataless, boring frame)4087 */4088 if (!th->ack)
4089 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4090 kfree_skb(skb, FREE_READ);
4091 return(0);
4092 }4093
4094 /*4095 * We no longer have anyone receiving data on this connection.4096 */4097
4098 #ifndef TCP_DONT_RST_SHUTDOWN
4099
4100 if(sk->shutdown & RCV_SHUTDOWN)
4101 {4102 /*4103 * FIXME: BSD has some magic to avoid sending resets to4104 * broken 4.2 BSD keepalives. Much to my surprise a few non4105 * BSD stacks still have broken keepalives so we want to4106 * cope with it.4107 */4108
4109 if(skb->len) /* We don't care if it's just an ack or4110 a keepalive/window probe */4111 {4112 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */4113
4114 /* Do this the way 4.4BSD treats it. Not what I'd4115 regard as the meaning of the spec but it's what BSD4116 does and clearly they know everything 8) */4117
4118 /*4119 * This is valid because of two things4120 *4121 * a) The way tcp_data behaves at the bottom.4122 * b) A fin takes effect when read not when received.4123 */4124
4125 shut_seq=sk->acked_seq+1; /* Last byte */4126
4127 if(after(new_seq,shut_seq))
4128 {4129 if(sk->debug)
4130 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4131 sk, new_seq, shut_seq, sk->blog);
4132 if(sk->dead)
4133 {4134 sk->acked_seq = new_seq + th->fin;
4135 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4136 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4137 tcp_statistics.TcpEstabResets++;
4138 tcp_set_state(sk,TCP_CLOSE);
4139 sk->err = EPIPE;
4140 sk->shutdown = SHUTDOWN_MASK;
4141 kfree_skb(skb, FREE_READ);
4142 return 0;
4143 }4144 }4145 }4146 }4147
4148 #endif4149
4150 /*4151 * Now we have to walk the chain, and figure out where this one4152 * goes into it. This is set up so that the last packet we received4153 * will be the first one we look at, that way if everything comes4154 * in order, there will be no performance loss, and if they come4155 * out of order we will be able to fit things in nicely.4156 *4157 * [AC: This is wrong. We should assume in order first and then walk4158 * forwards from the first hole based upon real traffic patterns.]4159 * 4160 */4161
4162 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */4163 {4164 skb_queue_head(&sk->receive_queue,skb);
4165 skb1= NULL;
4166 }4167 else4168 {4169 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
4170 {4171 if(sk->debug)
4172 {4173 printk("skb1=%p :", skb1);
4174 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
4175 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
4176 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4177 sk->acked_seq);
4178 }4179
4180 /*4181 * Optimisation: Duplicate frame or extension of previous frame from4182 * same sequence point (lost ack case).4183 * The frame contains duplicate data or replaces a previous frame4184 * discard the previous frame (safe as sk->inuse is set) and put4185 * the new one in its place.4186 */4187
4188 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
4189 {4190 skb_append(skb1,skb);
4191 skb_unlink(skb1);
4192 kfree_skb(skb1,FREE_READ);
4193 dup_dumped=1;
4194 skb1=NULL;
4195 break;
4196 }4197
4198 /*4199 * Found where it fits4200 */4201
4202 if (after(th->seq+1, skb1->h.th->seq))
4203 {4204 skb_append(skb1,skb);
4205 break;
4206 }4207
4208 /*4209 * See if we've hit the start. If so insert.4210 */4211 if (skb1 == skb_peek(&sk->receive_queue))
4212 {4213 skb_queue_head(&sk->receive_queue, skb);
4214 break;
4215 }4216 }4217 }4218
4219 /*4220 * Figure out what the ack value for this frame is4221 */4222
4223 th->ack_seq = th->seq + skb->len;
4224 if (th->syn)
4225 th->ack_seq++;
4226 if (th->fin)
4227 th->ack_seq++;
4228
4229 if (before(sk->acked_seq, sk->copied_seq))
4230 {4231 printk("*** tcp.c:tcp_data bug acked < copied\n");
4232 sk->acked_seq = sk->copied_seq;
4233 }4234
4235 /*4236 * Now figure out if we can ack anything. This is very messy because we really want two4237 * receive queues, a completed and an assembly queue. We also want only one transmit4238 * queue.4239 */4240
4241 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
4242 {4243 if (before(th->seq, sk->acked_seq+1))
4244 {4245 intnewwindow;
4246
4247 if (after(th->ack_seq, sk->acked_seq))
4248 {4249 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
4250 if (newwindow < 0)
4251 newwindow = 0;
4252 sk->window = newwindow;
4253 sk->acked_seq = th->ack_seq;
4254 }4255 skb->acked = 1;
4256
4257 /*4258 * When we ack the fin, we do the FIN 4259 * processing.4260 */4261
4262 if (skb->h.th->fin)
4263 {4264 tcp_fin(skb,sk,skb->h.th);
4265 }4266
4267 for(skb2 = skb->next;
4268 skb2 != (structsk_buff *)&sk->receive_queue;
4269 skb2 = skb2->next)
4270 {4271 if (before(skb2->h.th->seq, sk->acked_seq+1))
4272 {4273 if (after(skb2->h.th->ack_seq, sk->acked_seq))
4274 {4275 newwindow = sk->window -
4276 (skb2->h.th->ack_seq - sk->acked_seq);
4277 if (newwindow < 0)
4278 newwindow = 0;
4279 sk->window = newwindow;
4280 sk->acked_seq = skb2->h.th->ack_seq;
4281 }4282 skb2->acked = 1;
4283 /*4284 * When we ack the fin, we do4285 * the fin handling.4286 */4287 if (skb2->h.th->fin)
4288 {4289 tcp_fin(skb,sk,skb->h.th);
4290 }4291
4292 /*4293 * Force an immediate ack.4294 */4295
4296 sk->ack_backlog = sk->max_ack_backlog;
4297 }4298 else4299 {4300 break;
4301 }4302 }4303
4304 /*4305 * This also takes care of updating the window.4306 * This if statement needs to be simplified.4307 */4308 if (!sk->delay_acks ||
4309 sk->ack_backlog >= sk->max_ack_backlog ||
4310 sk->bytes_rcv > sk->max_unacked || th->fin) {4311 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4312 }4313 else4314 {4315 sk->ack_backlog++;
4316 if(sk->debug)
4317 printk("Ack queued.\n");
4318 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4319 }4320 }4321 }4322
4323 /*4324 * If we've missed a packet, send an ack.4325 * Also start a timer to send another.4326 */4327
4328 if (!skb->acked)
4329 {4330
4331 /*4332 * This is important. If we don't have much room left,4333 * we need to throw out a few packets so we have a good4334 * window. Note that mtu is used, not mss, because mss is really4335 * for the send side. He could be sending us stuff as large as mtu.4336 */4337
4338 while (sock_rspace(sk) < sk->mtu)
4339 {4340 skb1 = skb_peek(&sk->receive_queue);
4341 if (skb1 == NULL)
4342 {4343 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4344 break;
4345 }4346
4347 /*4348 * Don't throw out something that has been acked. 4349 */4350
4351 if (skb1->acked)
4352 {4353 break;
4354 }4355
4356 skb_unlink(skb1);
4357 kfree_skb(skb1, FREE_READ);
4358 }4359 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4360 sk->ack_backlog++;
4361 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4362 }4363 else4364 {4365 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4366 }4367
4368 /*4369 * Now tell the user we may have some data. 4370 */4371
4372 if (!sk->dead)
4373 {4374 if(sk->debug)
4375 printk("Data wakeup.\n");
4376 sk->data_ready(sk,0);
4377 }4378 return(0);
4379 }4380
4381
4382 /*4383 * This routine is only called when we have urgent data4384 * signalled. Its the 'slow' part of tcp_urg. It could be4385 * moved inline now as tcp_urg is only called from one4386 * place. We handle URGent data wrong. We have to - as4387 * BSD still doesn't use the correction from RFC961.4388 */4389
4390 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4391 {4392 u32ptr = ntohs(th->urg_ptr);
4393
4394 if (ptr)
4395 ptr--;
4396 ptr += th->seq;
4397
4398 /* ignore urgent data that we've already seen and read */4399 if (after(sk->copied_seq, ptr))
4400 return;
4401
4402 /* do we already have a newer (or duplicate) urgent pointer? */4403 if (sk->urg_data && !after(ptr, sk->urg_seq))
4404 return;
4405
4406 /* tell the world about our new urgent pointer */4407 if (sk->proc != 0) {4408 if (sk->proc > 0) {4409 kill_proc(sk->proc, SIGURG, 1);
4410 }else{4411 kill_pg(-sk->proc, SIGURG, 1);
4412 }4413 }4414 sk->urg_data = URG_NOTYET;
4415 sk->urg_seq = ptr;
4416 }4417
4418 /*4419 * This is the 'fast' part of urgent handling.4420 */4421
4422 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4423 unsignedlongsaddr, unsignedlonglen)
4424 {4425 u32ptr;
4426
4427 /*4428 * Check if we get a new urgent pointer - normally not 4429 */4430
4431 if (th->urg)
4432 tcp_check_urg(sk,th);
4433
4434 /*4435 * Do we wait for any urgent data? - normally not4436 */4437
4438 if (sk->urg_data != URG_NOTYET)
4439 return 0;
4440
4441 /*4442 * Is the urgent pointer pointing into this packet? 4443 */4444
4445 ptr = sk->urg_seq - th->seq + th->doff*4;
4446 if (ptr >= len)
4447 return 0;
4448
4449 /*4450 * Ok, got the correct packet, update info 4451 */4452
4453 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4454 if (!sk->dead)
4455 sk->data_ready(sk,0);
4456 return 0;
4457 }4458
4459 /*4460 * This will accept the next outstanding connection. 4461 */4462
/*
 * Accept the next outstanding connection on a listening socket.
 *
 * sk    - the listening socket
 * flags - file flags; O_NONBLOCK selects non-blocking behaviour
 *
 * Returns the newly established child socket, or NULL with sk->err set
 * (EINVAL if not listening, EAGAIN if non-blocking and nothing pending,
 * ERESTARTSYS if interrupted by a signal).
 */
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 * We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */
	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race: disable interrupts while taking the socket.
	   NOTE(review): sk->inuse appears to act as the per-socket lock
	   in this kernel, paired with release_sock() - confirm. */
	cli();
	sk->inuse = 1;

	/* Wait until an established connection is queued. */
	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Drop the lock before sleeping so the softirq side can
		   queue new connections, then re-take it on wakeup. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked)
		{
			/* Interrupted: the lock was already released before
			   sleeping, so just restore interrupts and bail. */
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
	}
	sti();

	/*
	 * Now all we need to do is return skb->sk (the child socket
	 * created at SYN time; the skb itself is no longer needed).
	 */
	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4517
4518 /*4519 * This will initiate an outgoing connection. 4520 */4521
/*
 * Initiate an outgoing TCP connection: validate the address, pick the
 * initial sequence number, build and transmit the SYN (with an MSS
 * option), and move the socket to SYN_SENT with the retransmit timer
 * armed.
 *
 * sk       - the socket to connect (must be in TCP_CLOSE)
 * usin     - destination address/port (AF_INET)
 * addr_len - length of *usin; must be at least 8
 *
 * Returns 0 on success or a negative errno (-EISCONN, -EINVAL,
 * -EAFNOSUPPORT, -ENETUNREACH, -ENOMEM).
 */
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
	{
		return(-EISCONN);
	}

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 * connect() to INADDR_ANY means loopback (BSD'ism).
	 */
	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 * Don't want a TCP connection going to a broadcast address.
	 */
	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	/* Take the socket and seed the sequence-number state. */
	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	/* Release before the blocking allocation below. */
	release_sock(sk);

	buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;

	/*
	 * Put in the IP header and routing stuff.
	 * Passing &sk->saddr lets the routing code fill in our source
	 * address when it is still unset.
	 */
	if (sk->localroute)
		rt=ip_rt_local(sk->daddr, NULL, sk->saddr ? NULL : &sk->saddr);
	else
		rt=ip_rt_route(sk->daddr, NULL, sk->saddr ? NULL : &sk->saddr);

	/*
	 * We need to build the routing stuff from the things saved in skb.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sock_wfree(sk, buff);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));

	/* Template header, then patch in the SYN-specific fields.
	   NOTE(review): ntohl here relies on byte-swap symmetry; htonl
	   would express the intent (host -> network) - confirm. */
	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	t1->seq = ntohl(sk->write_seq++);
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;	/* 24 bytes: header plus the 4-byte MSS option */
	/* use 512 or whatever user asked for */

	/* Route metrics, when available, clamp the window and set MSS. */
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
		sk->mtu = rt->rt_mss;
	else
	{
		/* No metric: use 576-derived MSS off-net, MAX_WINDOW on-net.
		   NOTE(review): dev is dereferenced here and below without a
		   NULL check - build_header presumably always sets it on
		   success; confirm. */
#ifdef CONFIG_INET_SNARL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
		else
			sk->mtu = MAX_WINDOW;
	}

	/*
	 * But not bigger than device MTU.
	 */
	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

	/*
	 * Put in the TCP options to say MTU (kind 2, length 4, MSS value).
	 */
	ptr = skb_put(buff,4);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 * This must go first otherwise a really quick response will get reset.
	 */
	tcp_cache_zap();
	tcp_set_state(sk,TCP_SYN_SENT);
	/* Initial RTO from the route's IRTT metric when present. */
	if(rt&&rt->rt_flags&RTF_IRTT)
		sk->rto = rt->rt_irtt;
	else
		sk->rto = TCP_TIMEOUT_INIT;
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = 0;	/* Now works the right way instead of a hacked initial setting */

	sk->prot->queue_xmit(sk, dev, buff, 0);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4682
4683 /* This functions checks to see if the tcp header is actually acceptable. */4684 extern__inline__inttcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4685 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4686 {4687 u32next_seq;
4688
4689 next_seq = len - 4*th->doff;
4690 if (th->fin)
4691 next_seq++;
4692 /* if we have a zero window, we can't have any data in the packet.. */4693 if (next_seq && !sk->window)
4694 gotoignore_it;
4695 next_seq += th->seq;
4696
4697 /*4698 * This isn't quite right. sk->acked_seq could be more recent4699 * than sk->window. This is however close enough. We will accept4700 * slightly more packets than we should, but it should not cause4701 * problems unless someone is trying to forge packets.4702 */4703
4704 /* have we already seen all of this packet? */4705 if (!after(next_seq+1, sk->acked_seq))
4706 gotoignore_it;
4707 /* or does it start beyond the window? */4708 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4709 gotoignore_it;
4710
4711 /* ok, at least part of this packet would seem interesting.. */4712 return 1;
4713
4714 ignore_it:
4715 if (th->rst)
4716 return 0;
4717
4718 /*4719 * Send a reset if we get something not ours and we are4720 * unsynchronized. Note: We don't do anything to our end. We4721 * are just killing the bogus remote connection then we will4722 * connect again and it will work (with luck).4723 */4724
4725 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4726 {4727 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4728 return 1;
4729 }4730
4731 /* Try to resync things. */4732 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4733 return 0;
4734 }4735
4736 /*4737 * When we get a reset we do this.4738 */4739
/*
 * Standard handling for a received RST: mark the socket zapped, set an
 * errno appropriate to the state it was in, move to CLOSE (unless
 * RFC1337 time-wait assassination protection applies), wake the owner
 * and drop the frame. Always returns 0.
 */
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Pick the error the user sees, based on the pre-reset state. */
	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}

#ifdef TCP_DO_RFC1337
	/*
	 * Time wait assassination protection [RFC1337]: a RST must not
	 * take us out of TIME_WAIT early.
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk, TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk, TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif

	if (!sk->dead)
		sk->state_change(sk);

	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 *
 *	Main receive entry point, registered in tcp_prot.  'redo' is set
 *	when the frame is being re-run from the socket backlog (checksum
 *	and socket charging were already done on the first pass).
 *	Returns 0 in all cases; the skb is always consumed here (freed,
 *	queued on the backlog, or passed on to tcp_data/tcp_conn_request).
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	tcp_statistics.TcpInSegs++;

	/* Only frames addressed to this host are processed; anything the
	   device saw promiscuously or as broadcast residue is dropped. */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket, using the last hit cache if applicable.
	 */

	if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
	{
		sk=(struct sock *)th_cache_sk;
		/*
		 *	We think this is causing the bug so
		 *	(debug cross-check: the cache entry must agree with a
		 *	full hash lookup; a mismatch is logged, not repaired)
		 */
		if(sk!=get_sock(&tcp_prot,th->dest, saddr, th->source, daddr))
			printk("Cache mismatch on TCP.\n");
	}
	else
	{
		sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
		th_cache_saddr=saddr;
		th_cache_daddr=daddr;
		th_cache_dport=th->dest;
		th_cache_sport=th->source;
		th_cache_sk=sk;
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo)
	{
		/*
		 *	Pull up the IP header.
		 */
		skb_pull(skb, skb->h.raw-skb->data);

		/*
		 *	Try to use the device checksum if provided.
		 */
		if (
			(skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
			(!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
		    )
		{
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}

		/* Sequence number is kept host-order in sk from here on. */
		th->seq = ntohl(th->seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			/*
			 *	Discard frame
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

/*		skb->len = len;*/
		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here.
		   cli/sti guard the inuse test-and-set against the timer/IRQ
		   path; if the socket is busy the frame is queued and will be
		   replayed later with redo=1. */
		cli();
		if (sk->inuse)
		{
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Backlog replay: socket may have died since the frame was queued. */
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 */

	if (sk->rmem_alloc + skb->truesize >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

	skb->sk=sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/* Reset the ack - its an ack from a
					   different connection  [ th->rst is checked in tcp_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}

				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq=th->seq+1;
				sk->fin_seq=th->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				/* Peer never advertised a window: pick a tiny
				   fallback so we can at least make progress. */
				if(sk->max_window==0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
	 *	a more complex suggestion for fixing these reuse issues in RFC1644
	 *	but not yet ready for general use. Also see RFC1379.
	 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(th->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			/* Hand the SYN to a listener on the same port, seeding its
			   ISN well past the old connection's write sequence. */
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */

	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}

	/*
	 *	Process the ACK
	 */


	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	Process urgent data
	 */

	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}


	/*
	 *	Process the encapsulated data
	 */

	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */

	release_sock(sk);
	return 0;
}
/*
 *	This routine sends a packet with an out of date sequence
 *	number. It assumes the other end will try to ack it.
 *
 *	Called as the window-probe action (see tcp_send_probe0).  Two
 *	cases: if the peer has opened a non-zero window and data is
 *	queued, a copy of the head segment (trimmed to the window) is
 *	transmitted; otherwise a bare ACK carrying sent_seq-1 is sent
 *	purely to provoke an ACK from the other end.
 */

static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff,*skb;
	struct tcphdr *t1;
	struct device *dev=NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 *	Write data can still be transmitted/retransmitted in the
	 *	following states.  If any other state is encountered, return.
	 *	[listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED &&
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 &&
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	)
	{
		return;
	}

	if ( before(sk->sent_seq, sk->window_seq) &&
	    (skb=skb_peek(&sk->write_queue)))
	{
		/*
		 * We are probing the opening of a window
		 * but the window size is != 0
		 * must have been a result SWS advoidance ( sender )
		 */

		struct iphdr *iph;
		struct tcphdr *th;
		struct tcphdr *nth;
		unsigned long win_size;
#if 0
		unsigned long ow_size;
#endif
		void * tcp_data_start;

		/*
		 *	How many bytes can we send ?
		 */

		win_size = sk->window_seq - sk->sent_seq;

		/*
		 *	Recover the buffer pointers
		 */

		iph = (struct iphdr *)skb->ip_hdr;
		th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));

		/*
		 *	Grab the data for a temporary frame
		 */

		buff = sock_wmalloc(sk, win_size + th->doff * 4 +
				     (iph->ihl << 2) +
				     sk->prot->max_header + 15,
				     1, GFP_ATOMIC);
		if ( buff == NULL )
			return;

		/*
		 *	If we strip the packet on the write queue we must
		 *	be ready to retransmit this one
		 */

		buff->free = /*0*/1;

		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put headers on the new packet
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
					 IPPROTO_TCP, sk->opt, buff->truesize,
					 sk->ip_tos,sk->ip_ttl);
		if (tmp < 0)
		{
			sock_wfree(sk, buff);
			return;
		}

		/*
		 *	Move the TCP header over
		 */

		buff->dev = dev;

		nth = (struct tcphdr *) skb_put(buff,th->doff*4);

		memcpy(nth, th, th->doff * 4);

		/*
		 *	Correct the new header
		 */

		nth->ack = 1;
		nth->ack_seq = ntohl(sk->acked_seq);
		nth->window = ntohs(tcp_select_window(sk));
		nth->check = 0;

		/*
		 *	Find the first data byte.
		 */

		tcp_data_start = skb->data + skb->dev->hard_header_len +
				(iph->ihl << 2) + th->doff * 4;

		/*
		 *	Add it to our new buffer
		 */
		memcpy(skb_put(buff,win_size), tcp_data_start, win_size);

		/*
		 *	Remember our right edge sequence number.
		 */

		buff->h.seq = sk->sent_seq + win_size;
		sk->sent_seq = buff->h.seq;		/* Hack */
#if 0

		/*
		 *	now: shrink the queue head segment
		 */

		th->check = 0;
		ow_size = skb->len - win_size -
			((unsigned long) (tcp_data_start - (void *) skb->data));

		memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
		skb_trim(skb,skb->len-win_size);
		sk->sent_seq += win_size;
		th->seq = htonl(sk->sent_seq);
		if (th->urg)
		{
			unsigned short urg_ptr;

			urg_ptr = ntohs(th->urg_ptr);
			if (urg_ptr <= win_size)
				th->urg = 0;
			else
			{
				urg_ptr -= win_size;
				th->urg_ptr = htons(urg_ptr);
				nth->urg_ptr = htons(win_size);
			}
		}
#else
		/* Drop the URG flag on the copy if the urgent pointer falls
		   inside the portion we are sending. */
		if(th->urg && ntohs(th->urg_ptr) < win_size)
			nth->urg = 0;
#endif

		/*
		 *	Checksum the split buffer
		 */

		tcp_send_check(nth, sk->saddr, sk->daddr,
			   nth->doff * 4 + win_size , sk);
	}
	else
	{
		buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
		if (buff == NULL)
			return;

		buff->free = 1;
		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put in the IP header and routing stuff.
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0)
		{
			sock_wfree(sk, buff);
			return;
		}

		t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
		memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

		/*
		 *	Use a previous sequence.
		 *	This should cause the other end to send an ack.
		 */

		t1->seq = htonl(sk->sent_seq-1);
		t1->ack = 1;
		t1->res1= 0;
		t1->res2= 0;
		t1->rst = 0;
		t1->urg = 0;
		t1->psh = 0;
		t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
		t1->syn = 0;
		t1->ack_seq = ntohl(sk->acked_seq);
		t1->window = ntohs(tcp_select_window(sk));
		t1->doff = sizeof(*t1)/4;
		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	}

	/*
	 *	Send it.
	 */

	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
5404 /*5405 * A window probe timeout has occurred.5406 */5407
5408 voidtcp_send_probe0(structsock *sk)
/* */5409 {5410 if (sk->zapped)
5411 return; /* After a valid reset we can send no more */5412
5413 tcp_write_wakeup(sk);
5414
5415 sk->backoff++;
5416 sk->rto = min(sk->rto << 1, 120*HZ);
5417 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5418 sk->retransmits++;
5419 sk->prot->retransmits ++;
5420 }5421
5422 /*5423 * Socket option code for TCP. 5424 */5425
5426 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5427 {5428 intval,err;
5429
5430 if(level!=SOL_TCP)
5431 returnip_setsockopt(sk,level,optname,optval,optlen);
5432
5433 if (optval == NULL)
5434 return(-EINVAL);
5435
5436 err=verify_area(VERIFY_READ, optval, sizeof(int));
5437 if(err)
5438 returnerr;
5439
5440 val = get_user((int *)optval);
5441
5442 switch(optname)
5443 {5444 caseTCP_MAXSEG:
5445 /*5446 * values greater than interface MTU won't take effect. however at5447 * the point when this call is done we typically don't yet know5448 * which interface is going to be used5449 */5450 if(val<1||val>MAX_WINDOW)
5451 return -EINVAL;
5452 sk->user_mss=val;
5453 return 0;
5454 caseTCP_NODELAY:
5455 sk->nonagle=(val==0)?0:1;
5456 return 0;
5457 default:
5458 return(-ENOPROTOOPT);
5459 }5460 }5461
5462 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5463 {5464 intval,err;
5465
5466 if(level!=SOL_TCP)
5467 returnip_getsockopt(sk,level,optname,optval,optlen);
5468
5469 switch(optname)
5470 {5471 caseTCP_MAXSEG:
5472 val=sk->user_mss;
5473 break;
5474 caseTCP_NODELAY:
5475 val=sk->nonagle;
5476 break;
5477 default:
5478 return(-ENOPROTOOPT);
5479 }5480 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5481 if(err)
5482 returnerr;
5483 put_user(sizeof(int),(int *) optlen);
5484
5485 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5486 if(err)
5487 returnerr;
5488 put_user(val,(int *)optval);
5489
5490 return(0);
5491 }5492

/*
 *	The TCP protocol operations table, wired into the INET layer.
 *	Positional initializer for struct proto; member labels below are
 *	inferred from the functions supplied — confirm against the
 *	struct proto declaration in the protocol header.
 */
struct proto tcp_prot = {
	tcp_close,			/* close */
	tcp_read,			/* read */
	tcp_write,			/* write */
	tcp_sendto,			/* sendto */
	tcp_recvfrom,			/* recvfrom */
	ip_build_header,		/* build_header */
	tcp_connect,			/* connect */
	tcp_accept,			/* accept */
	ip_queue_xmit,			/* queue_xmit */
	tcp_retransmit,			/* retransmit */
	tcp_write_wakeup,		/* write_wakeup */
	tcp_read_wakeup,		/* read_wakeup */
	tcp_rcv,			/* rcv */
	tcp_select,			/* select */
	tcp_ioctl,			/* ioctl */
	NULL,				/* init — none needed for TCP */
	tcp_shutdown,			/* shutdown */
	tcp_setsockopt,			/* setsockopt */
	tcp_getsockopt,			/* getsockopt */
	tcp_sendmsg,			/* sendmsg */
	tcp_recvmsg,			/* recvmsg */
	128,				/* max_header */
	0,				/* retransmits counter */
	"TCP",				/* protocol name */
	0, 0,				/* usage counters — presumably inuse/highestinuse */
	{NULL,}				/* per-protocol socket array */
};