1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while sk->inuse=1 26 * and was trying to connect (tcp_err()). 27 * Alan Cox : All icmp error handling was broken 28 * pointers passed where wrong and the 29 * socket was looked up backwards. Nobody 30 * tested any icmp error code obviously. 31 * Alan Cox : tcp_err() now handled properly. It wakes people 32 * on errors. select behaves and the icmp error race 33 * has gone by moving it into sock.c 34 * Alan Cox : tcp_reset() fixed to work for everything not just 35 * packets for unknown sockets. 36 * Alan Cox : tcp option processing. 37 * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong] 38 * Herp Rosmanith : More reset fixes 39 * Alan Cox : No longer acks invalid rst frames. Acking 40 * any kind of RST is right out. 41 * Alan Cox : Sets an ignore me flag on an rst receive 42 * otherwise odd bits of prattle escape still 43 * Alan Cox : Fixed another acking RST frame bug. Should stop 44 * LAN workplace lockups. 
45 * Alan Cox : Some tidyups using the new skb list facilities 46 * Alan Cox : sk->keepopen now seems to work 47 * Alan Cox : Pulls options out correctly on accepts 48 * Alan Cox : Fixed assorted sk->rqueue->next errors 49 * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops. 50 * Alan Cox : Tidied tcp_data to avoid a potential nasty. 51 * Alan Cox : Added some better commenting, as the tcp is hard to follow 52 * Alan Cox : Removed incorrect check for 20 * psh 53 * Michael O'Reilly : ack < copied bug fix. 54 * Johannes Stille : Misc tcp fixes (not all in yet). 55 * Alan Cox : FIN with no memory -> CRASH 56 * Alan Cox : Added socket option proto entries. Also added awareness of them to accept. 57 * Alan Cox : Added TCP options (SOL_TCP) 58 * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets. 59 * Alan Cox : Use ip_tos/ip_ttl settings. 60 * Alan Cox : Handle FIN (more) properly (we hope). 61 * Alan Cox : RST frames sent on unsynchronised state ack error/ 62 * Alan Cox : Put in missing check for SYN bit. 63 * Alan Cox : Added tcp_select_window() aka NET2E 64 * window non shrink trick. 65 * Alan Cox : Added a couple of small NET2E timer fixes 66 * Charles Hedrick : TCP fixes 67 * Toomas Tamm : TCP window fixes 68 * Alan Cox : Small URG fix to rlogin ^C ack fight 69 * Charles Hedrick : Rewrote most of it to actually work 70 * Linus : Rewrote tcp_read() and URG handling 71 * completely 72 * Gerhard Koerting: Fixed some missing timer handling 73 * Matthew Dillon : Reworked TCP machine states as per RFC 74 * Gerhard Koerting: PC/TCP workarounds 75 * Adam Caldwell : Assorted timer/timing errors 76 * Matthew Dillon : Fixed another RST bug 77 * Alan Cox : Move to kernel side addressing changes. 78 * Alan Cox : Beginning work on TCP fastpathing (not yet usable) 79 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
80 * Alan Cox : TCP fast path debugging 81 * Alan Cox : Window clamping 82 * Michael Riepe : Bug in tcp_check() 83 * Matt Dillon : More TCP improvements and RST bug fixes 84 * Matt Dillon : Yet more small nasties remove from the TCP code 85 * (Be very nice to this man if tcp finally works 100%) 8) 86 * Alan Cox : BSD accept semantics. 87 * Alan Cox : Reset on closedown bug. 88 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 89 * Michael Pall : Handle select() after URG properly in all cases. 90 * Michael Pall : Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin). 91 * Michael Pall : Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now. 92 * Michael Pall : recv(...,MSG_OOB) never blocks in the BSD api. 93 * Alan Cox : Changed the semantics of sk->socket to 94 * fix a race and a signal problem with 95 * accept() and async I/O. 96 * Alan Cox : Relaxed the rules on tcp_sendto(). 97 * Yury Shevchuk : Really fixed accept() blocking problem. 98 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 99 * clients/servers which listen in on 100 * fixed ports. 101 * Alan Cox : Cleaned the above up and shrank it to 102 * a sensible code size. 103 * Alan Cox : Self connect lockup fix. 104 * Alan Cox : No connect to multicast. 105 * Ross Biro : Close unaccepted children on master 106 * socket close. 107 * Alan Cox : Reset tracing code. 108 * Alan Cox : Spurious resets on shutdown. 109 * Alan Cox : Giant 15 minute/60 second timer error 110 * Alan Cox : Small whoops in selecting before an accept. 111 * Alan Cox : Kept the state trace facility since it's 112 * handy for debugging. 113 * Alan Cox : More reset handler fixes. 114 * Alan Cox : Started rewriting the code based on the RFC's 115 * for other useful protocol references see: 116 * Comer, KA9Q NOS, and for a reference on the 117 * difference between specifications and how BSD 118 * works see the 4.4lite source. 
119 * A.N.Kuznetsov : Don't time wait on completion of tidy 120 * close. 121 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 122 * Linus Torvalds : Fixed BSD port reuse to work first syn 123 * Alan Cox : Reimplemented timers as per the RFC and using multiple 124 * timers for sanity. 125 * Alan Cox : Small bug fixes, and a lot of new 126 * comments. 127 * Alan Cox : Fixed dual reader crash by locking 128 * the buffers (much like datagram.c) 129 * Alan Cox : Fixed stuck sockets in probe. A probe 130 * now gets fed up of retrying without 131 * (even a no space) answer. 132 * Alan Cox : Extracted closing code better 133 * Alan Cox : Fixed the closing state machine to 134 * resemble the RFC. 135 * Alan Cox : More 'per spec' fixes. 136 * Jorge Cwik : Even faster checksumming. 137 * Alan Cox : tcp_data() doesn't ack illegal PSH 138 * only frames. At least one pc tcp stack 139 * generates them. 140 * Alan Cox : Cache last socket. 141 * Alan Cox : Per route irtt. 142 * Matt Day : Select() match BSD precisely on error 143 * 144 * 145 * To Fix: 146 * Fast path the code. Two things here - fix the window calculation 147 * so it doesn't iterate over the queue, also spot packets with no funny 148 * options arriving in order and process directly. 149 * 150 * Implement RFC 1191 [Path MTU discovery] 151 * Look at the effect of implementing RFC 1337 suggestions and their impact. 152 * Rewrite output state machine to use a single queue and do low window 153 * situations as per the spec (RFC 1122) 154 * Speed up input assembly algorithm. 155 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 156 * could do with it working on IPv4 157 * User settable/learned rtt/max window/mtu 158 * Cope with MTU/device switches when retransmitting in tcp. 159 * Fix the window handling to use PR's new code. 160 * 161 * Change the fundamental structure to a single send queue maintained 162 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 163 * active routes too]). 
Cut the queue off in tcp_retransmit/ 164 * tcp_transmit. 165 * Change the receive queue to assemble as it goes. This lets us 166 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 167 * tcp_data/tcp_read as well as the window shrink crud. 168 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 169 * tcp_queue_skb seem obvious routines to extract. 170 * 171 * This program is free software; you can redistribute it and/or 172 * modify it under the terms of the GNU General Public License 173 * as published by the Free Software Foundation; either version 174 * 2 of the License, or(at your option) any later version. 175 * 176 * Description of States: 177 * 178 * TCP_SYN_SENT sent a connection request, waiting for ack 179 * 180 * TCP_SYN_RECV received a connection request, sent ack, 181 * waiting for final ack in three-way handshake. 182 * 183 * TCP_ESTABLISHED connection established 184 * 185 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 186 * transmission of remaining buffered data 187 * 188 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 189 * to shutdown 190 * 191 * TCP_CLOSING both sides have shutdown but we still have 192 * data we have to finish sending 193 * 194 * TCP_TIME_WAIT timeout to catch resent junk before entering 195 * closed, can only be entered from FIN_WAIT2 196 * or CLOSING. Required because the other end 197 * may not have gotten our last ACK causing it 198 * to retransmit the data packet (which we ignore) 199 * 200 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 201 * us to finish writing our data and to shutdown 202 * (we have to close() to move on to LAST_ACK) 203 * 204 * TCP_LAST_ACK out side has shutdown after remote has 205 * shutdown. There may still be data in our 206 * buffer that we have to finish sending 207 * 208 * TCP_CLOSE socket is finished 209 */ 210
211 #include <linux/types.h>
212 #include <linux/sched.h>
213 #include <linux/mm.h>
214 #include <linux/time.h>
215 #include <linux/string.h>
216 #include <linux/config.h>
217 #include <linux/socket.h>
218 #include <linux/sockios.h>
219 #include <linux/termios.h>
220 #include <linux/in.h>
221 #include <linux/fcntl.h>
222 #include <linux/inet.h>
223 #include <linux/netdevice.h>
224 #include <net/snmp.h>
225 #include <net/ip.h>
226 #include <net/protocol.h>
227 #include <net/icmp.h>
228 #include <net/tcp.h>
229 #include <net/arp.h>
230 #include <linux/skbuff.h>
231 #include <net/sock.h>
232 #include <net/route.h>
233 #include <linux/errno.h>
234 #include <linux/timer.h>
235 #include <asm/system.h>
236 #include <asm/segment.h>
237 #include <linux/mm.h>
238 #include <net/checksum.h>
239
/*
 *	The MSL timer is the 'normal' timer.
 */

#define reset_msl_timer(x,y,z)	reset_timer(x,y,z)

/* NOTE(review): seq_offset is presumably used when generating initial
   sequence numbers — its users are not visible in this chunk; confirm. */
#define SEQ_TICK 3
unsigned long seq_offset;
/* TCP MIB counters (TcpOutSegs, TcpCurrEstab, ... incremented below). */
struct tcp_mib	tcp_statistics;

/*
 *	Cached last hit socket: the last address/port 4-tuple matched to a
 *	socket. Cleared under cli() by tcp_cache_zap() below.
 */

volatile unsigned long th_cache_saddr, th_cache_daddr;
volatile unsigned short th_cache_dport, th_cache_sport;
volatile struct sock *th_cache_sk;
258 voidtcp_cache_zap(void)
/* */ 259 { 260 unsignedlongflags;
261 save_flags(flags);
262 cli();
263 th_cache_saddr=0;
264 th_cache_daddr=0;
265 th_cache_dport=0;
266 th_cache_sport=0;
267 th_cache_sk=NULL;
268 restore_flags(flags);
269 } 270
/* Forward declaration: tcp_close() is needed by tcp_close_pending() below. */
static void tcp_close(struct sock *sk, int timeout);


/*
 *	The less said about this the better, but it works and will do for 1.2
 *	(A select() on a listening socket sleeps here; tcp_set_state() wakes
 *	it when a three-way handshake completes.)
 */

static struct wait_queue *master_select_wakeup;
/*
 *	Return the smaller of two unsigned quantities.
 */

static __inline__ int min(unsigned int a, unsigned int b)
{
	return (a < b) ? a : b;
}
/* The state trace is compiled out by default; define STATE_TRACE to get it. */
#undef STATE_TRACE

#ifdef STATE_TRACE
/* Human-readable names for the TCP states, indexed by the TCP_* value. */
static char *statename[]={
	"Unused","Established","Syn Sent","Syn Recv",
	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
	"Close Wait","Last ACK","Listen","Closing"
};
#endif
297 static__inline__voidtcp_set_state(structsock *sk, intstate)
/* */ 298 { 299 if(sk->state==TCP_ESTABLISHED)
300 tcp_statistics.TcpCurrEstab--;
301 #ifdefSTATE_TRACE 302 if(sk->debug)
303 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
304 #endif 305 /* This is a hack but it doesn't occur often and it's going to 306 be a real to fix nicely */ 307
308 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
309 { 310 wake_up_interruptible(&master_select_wakeup);
311 } 312 sk->state=state;
313 if(state==TCP_ESTABLISHED)
314 tcp_statistics.TcpCurrEstab++;
315 } 316
317 /* 318 * This routine picks a TCP windows for a socket based on 319 * the following constraints 320 * 321 * 1. The window can never be shrunk once it is offered (RFC 793) 322 * 2. We limit memory per socket 323 * 324 * For now we use NET2E3's heuristic of offering half the memory 325 * we have handy. All is not as bad as this seems however because 326 * of two things. Firstly we will bin packets even within the window 327 * in order to get the data we are waiting for into the memory limit. 328 * Secondly we bin common duplicate forms at receive time 329 * Better heuristics welcome 330 */ 331
332 inttcp_select_window(structsock *sk)
/* */ 333 { 334 intnew_window = sk->prot->rspace(sk);
335
336 if(sk->window_clamp)
337 new_window=min(sk->window_clamp,new_window);
338 /* 339 * Two things are going on here. First, we don't ever offer a 340 * window less than min(sk->mss, MAX_WINDOW/2). This is the 341 * receiver side of SWS as specified in RFC1122. 342 * Second, we always give them at least the window they 343 * had before, in order to avoid retracting window. This 344 * is technically allowed, but RFC1122 advises against it and 345 * in practice it causes trouble. 346 * 347 * Fixme: This doesn't correctly handle the case where 348 * new_window > sk->window but not by enough to allow for the 349 * shift in sequence space. 350 */ 351 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
352 return(sk->window);
353 return(new_window);
354 } 355
356 /* 357 * Find someone to 'accept'. Must be called with 358 * sk->inuse=1 or cli() 359 */ 360
361 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 362 { 363 structsk_buff *p=skb_peek(&s->receive_queue);
364 if(p==NULL)
365 returnNULL;
366 do 367 { 368 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
369 returnp;
370 p=p->next;
371 } 372 while(p!=(structsk_buff *)&s->receive_queue);
373 returnNULL;
374 } 375
376 /* 377 * Remove a completed connection and return it. This is used by 378 * tcp_accept() to get connections from the queue. 379 */ 380
381 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 382 { 383 structsk_buff *skb;
384 unsignedlongflags;
385 save_flags(flags);
386 cli();
387 skb=tcp_find_established(s);
388 if(skb!=NULL)
389 skb_unlink(skb); /* Take it off the queue */ 390 restore_flags(flags);
391 returnskb;
392 } 393
394 /* 395 * This routine closes sockets which have been at least partially 396 * opened, but not yet accepted. Currently it is only called by 397 * tcp_close, and timeout mirrors the value there. 398 */ 399
400 staticvoidtcp_close_pending (structsock *sk)
/* */ 401 { 402 structsk_buff *skb;
403
404 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
405 { 406 skb->sk->dead=1;
407 tcp_close(skb->sk, 0);
408 kfree_skb(skb, FREE_READ);
409 } 410 return;
411 } 412
413 /* 414 * Enter the time wait state. 415 */ 416
417 staticvoidtcp_time_wait(structsock *sk)
/* */ 418 { 419 tcp_set_state(sk,TCP_TIME_WAIT);
420 sk->shutdown = SHUTDOWN_MASK;
421 if (!sk->dead)
422 sk->state_change(sk);
423 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
424 } 425
426 /* 427 * A socket has timed out on its send queue and wants to do a 428 * little retransmitting. Currently this means TCP. 429 */ 430
431 voidtcp_do_retransmit(structsock *sk, intall)
/* */ 432 { 433 structsk_buff * skb;
434 structproto *prot;
435 structdevice *dev;
436 intct=0;
437
438 prot = sk->prot;
439 skb = sk->send_head;
440
441 while (skb != NULL)
442 { 443 structtcphdr *th;
444 structiphdr *iph;
445 intsize;
446
447 dev = skb->dev;
448 IS_SKB(skb);
449 skb->when = jiffies;
450
451 /* 452 * In general it's OK just to use the old packet. However we 453 * need to use the current ack and window fields. Urg and 454 * urg_ptr could possibly stand to be updated as well, but we 455 * don't keep the necessary data. That shouldn't be a problem, 456 * if the other end is doing the right thing. Since we're 457 * changing the packet, we have to issue a new IP identifier. 458 */ 459
460 iph = (structiphdr *)(skb->data + dev->hard_header_len);
461 th = (structtcphdr *)(((char *)iph) + (iph->ihl << 2));
462 size = skb->len - (((unsignedchar *) th) - skb->data);
463
464 /* 465 * Note: We ought to check for window limits here but 466 * currently this is done (less efficiently) elsewhere. 467 * We do need to check for a route change but can't handle 468 * that until we have the new 1.3.x buffers in. 469 * 470 */ 471
472 iph->id = htons(ip_id_count++);
473 ip_send_check(iph);
474
475 /* 476 * This is not the right way to handle this. We have to 477 * issue an up to date window and ack report with this 478 * retransmit to keep the odd buggy tcp that relies on 479 * the fact BSD does this happy. 480 * We don't however need to recalculate the entire 481 * checksum, so someone wanting a small problem to play 482 * with might like to implement RFC1141/RFC1624 and speed 483 * this up by avoiding a full checksum. 484 */ 485
486 th->ack_seq = ntohl(sk->acked_seq);
487 th->window = ntohs(tcp_select_window(sk));
488 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
489
490 /* 491 * If the interface is (still) up and running, kick it. 492 */ 493
494 if (dev->flags & IFF_UP)
495 { 496 /* 497 * If the packet is still being sent by the device/protocol 498 * below then don't retransmit. This is both needed, and good - 499 * especially with connected mode AX.25 where it stops resends 500 * occurring of an as yet unsent anyway frame! 501 * We still add up the counts as the round trip time wants 502 * adjusting. 503 */ 504 if (sk && !skb_device_locked(skb))
505 { 506 /* Remove it from any existing driver queue first! */ 507 skb_unlink(skb);
508 /* Now queue it */ 509 ip_statistics.IpOutRequests++;
510 dev_queue_xmit(skb, dev, sk->priority);
511 } 512 } 513
514 /* 515 * Count retransmissions 516 */ 517
518 ct++;
519 sk->prot->retransmits ++;
520
521 /* 522 * Only one retransmit requested. 523 */ 524
525 if (!all)
526 break;
527
528 /* 529 * This should cut it off before we send too many packets. 530 */ 531
532 if (ct >= sk->cong_window)
533 break;
534 skb = skb->link3;
535 } 536 } 537
538 /* 539 * Reset the retransmission timer 540 */ 541
542 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 543 { 544 del_timer(&sk->retransmit_timer);
545 sk->ip_xmit_timeout = why;
546 if((int)when < 0)
547 { 548 when=3;
549 printk("Error: Negative timer in xmit_timer\n");
550 } 551 sk->retransmit_timer.expires=when;
552 add_timer(&sk->retransmit_timer);
553 } 554
555 /* 556 * This is the normal code called for timeouts. It does the retransmission 557 * and then does backoff. tcp_do_retransmit is separated out because 558 * tcp_ack needs to send stuff from the retransmit queue without 559 * initiating a backoff. 560 */ 561
562
563 voidtcp_retransmit_time(structsock *sk, intall)
/* */ 564 { 565 tcp_do_retransmit(sk, all);
566
567 /* 568 * Increase the timeout each time we retransmit. Note that 569 * we do not increase the rtt estimate. rto is initialized 570 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 571 * that doubling rto each time is the least we can get away with. 572 * In KA9Q, Karn uses this for the first few times, and then 573 * goes to quadratic. netBSD doubles, but only goes up to *64, 574 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 575 * defined in the protocol as the maximum possible RTT. I guess 576 * we'll have to use something other than TCP to talk to the 577 * University of Mars. 578 * 579 * PAWS allows us longer timeouts and large windows, so once 580 * implemented ftp to mars will work nicely. We will have to fix 581 * the 120 second clamps though! 582 */ 583
584 sk->retransmits++;
585 sk->backoff++;
586 sk->rto = min(sk->rto << 1, 120*HZ);
587 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
588 } 589
590
591 /* 592 * A timer event has trigger a tcp retransmit timeout. The 593 * socket xmit queue is ready and set up to send. Because 594 * the ack receive code keeps the queue straight we do 595 * nothing clever here. 596 */ 597
598 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 599 { 600 if (all)
601 { 602 tcp_retransmit_time(sk, all);
603 return;
604 } 605
606 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 607 /* sk->ssthresh in theory can be zero. I guess that's OK */ 608 sk->cong_count = 0;
609
610 sk->cong_window = 1;
611
612 /* Do the actual retransmit. */ 613 tcp_retransmit_time(sk, all);
614 } 615
616 /* 617 * A write timeout has occurred. Process the after effects. 618 */ 619
620 staticinttcp_write_timeout(structsock *sk)
/* */ 621 { 622 /* 623 * Look for a 'soft' timeout. 624 */ 625 if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
626 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
627 { 628 /* 629 * Attempt to recover if arp has changed (unlikely!) or 630 * a route has shifted (not supported prior to 1.3). 631 */ 632 arp_destroy (sk->daddr, 0);
633 /*ip_route_check (sk->daddr);*/ 634 } 635 /* 636 * Has it gone just too far ? 637 */ 638 if (sk->retransmits > TCP_RETR2)
639 { 640 sk->err = ETIMEDOUT;
641 sk->error_report(sk);
642 del_timer(&sk->retransmit_timer);
643 /* 644 * Time wait the socket 645 */ 646 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
647 { 648 tcp_set_state(sk,TCP_TIME_WAIT);
649 reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
650 } 651 else 652 { 653 /* 654 * Clean up time. 655 */ 656 tcp_set_state(sk, TCP_CLOSE);
657 return 0;
658 } 659 } 660 return 1;
661 } 662
663 /* 664 * The TCP retransmit timer. This lacks a few small details. 665 * 666 * 1. An initial rtt timeout on the probe0 should cause what we can 667 * of the first write queue buffer to be split and sent. 668 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report 669 * ETIMEDOUT if we know an additional 'soft' error caused this. 670 * tcp_err should save a 'soft error' for us. 671 */ 672
673 staticvoidretransmit_timer(unsignedlongdata)
/* */ 674 { 675 structsock *sk = (structsock*)data;
676 intwhy = sk->ip_xmit_timeout;
677
678 /* 679 * only process if socket is not in use 680 */ 681
682 cli();
683 if (sk->inuse || in_bh)
684 { 685 /* Try again in 1 second */ 686 sk->retransmit_timer.expires = HZ;
687 add_timer(&sk->retransmit_timer);
688 sti();
689 return;
690 } 691
692 sk->inuse = 1;
693 sti();
694
695 /* Always see if we need to send an ack. */ 696
697 if (sk->ack_backlog && !sk->zapped)
698 { 699 sk->prot->read_wakeup (sk);
700 if (! sk->dead)
701 sk->data_ready(sk,0);
702 } 703
704 /* Now we need to figure out why the socket was on the timer. */ 705
706 switch (why)
707 { 708 /* Window probing */ 709 caseTIME_PROBE0:
710 tcp_send_probe0(sk);
711 tcp_write_timeout(sk);
712 break;
713 /* Retransmitting */ 714 caseTIME_WRITE:
715 /* It could be we got here because we needed to send an ack. 716 * So we need to check for that. 717 */ 718 { 719 structsk_buff *skb;
720 unsignedlongflags;
721
722 save_flags(flags);
723 cli();
724 skb = sk->send_head;
725 if (!skb)
726 { 727 restore_flags(flags);
728 } 729 else 730 { 731 /* 732 * Kicked by a delayed ack. Reset timer 733 * correctly now 734 */ 735 if (jiffies < skb->when + sk->rto)
736 { 737 reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
738 restore_flags(flags);
739 break;
740 } 741 restore_flags(flags);
742 /* 743 * Retransmission 744 */ 745 sk->prot->retransmit (sk, 0);
746 tcp_write_timeout(sk);
747 } 748 break;
749 } 750 /* Sending Keepalives */ 751 caseTIME_KEEPOPEN:
752 /* 753 * this reset_timer() call is a hack, this is not 754 * how KEEPOPEN is supposed to work. 755 */ 756 reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
757
758 /* Send something to keep the connection open. */ 759 if (sk->prot->write_wakeup)
760 sk->prot->write_wakeup (sk);
761 sk->retransmits++;
762 tcp_write_timeout(sk);
763 break;
764 default:
765 printk ("rexmit_timer: timer expired - reason unknown\n");
766 break;
767 } 768 release_sock(sk);
769 } 770
771 /* 772 * This routine is called by the ICMP module when it gets some 773 * sort of error condition. If err < 0 then the socket should 774 * be closed and the error returned to the user. If err > 0 775 * it's just the icmp type << 8 | icmp code. After adjustment 776 * header points to the first 8 bytes of the tcp header. We need 777 * to find the appropriate port. 778 */ 779
780 voidtcp_err(interr, unsignedchar *header, unsignedlongdaddr,
/* */ 781 unsignedlongsaddr, structinet_protocol *protocol)
782 { 783 structtcphdr *th;
784 structsock *sk;
785 structiphdr *iph=(structiphdr *)header;
786
787 header+=4*iph->ihl;
788
789
790 th =(structtcphdr *)header;
791 sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
792
793 if (sk == NULL)
794 return;
795
796 if(err<0)
797 { 798 sk->err = -err;
799 sk->error_report(sk);
800 return;
801 } 802
803 if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8))
804 { 805 /* 806 * FIXME: 807 * For now we will just trigger a linear backoff. 808 * The slow start code should cause a real backoff here. 809 */ 810 if (sk->cong_window > 4)
811 sk->cong_window--;
812 return;
813 } 814
815 /* sk->err = icmp_err_convert[err & 0xff].errno; -- moved as TCP should hide non fatals internally (and does) */ 816
817 /* 818 * If we've already connected we will keep trying 819 * until we time out, or the user gives up. 820 */ 821
822 if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT)
823 { 824 if (sk->state == TCP_SYN_SENT)
825 { 826 tcp_statistics.TcpAttemptFails++;
827 tcp_set_state(sk,TCP_CLOSE);
828 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ 829 } 830 sk->err = icmp_err_convert[err & 0xff].errno;
831 } 832 return;
833 } 834
835
836 /* 837 * Walk down the receive queue counting readable data until we hit the end or we find a gap 838 * in the received data queue (ie a frame missing that needs sending to us). Not 839 * sorting using two queues as data arrives makes life so much harder. 840 */ 841
842 staticinttcp_readable(structsock *sk)
/* */ 843 { 844 unsignedlongcounted;
845 unsignedlongamount;
846 structsk_buff *skb;
847 intsum;
848 unsignedlongflags;
849
850 if(sk && sk->debug)
851 printk("tcp_readable: %p - ",sk);
852
853 save_flags(flags);
854 cli();
855 if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
856 { 857 restore_flags(flags);
858 if(sk && sk->debug)
859 printk("empty\n");
860 return(0);
861 } 862
863 counted = sk->copied_seq; /* Where we are at the moment */ 864 amount = 0;
865
866 /* 867 * Do until a push or until we are out of data. 868 */ 869
870 do 871 { 872 if (before(counted, skb->h.th->seq)) /* Found a hole so stops here */ 873 break;
874 sum = skb->len -(counted - skb->h.th->seq); /* Length - header but start from where we are up to (avoid overlaps) */ 875 if (skb->h.th->syn)
876 sum++;
877 if (sum > 0)
878 {/* Add it up, move on */ 879 amount += sum;
880 if (skb->h.th->syn)
881 amount--;
882 counted += sum;
883 } 884 /* 885 * Don't count urg data ... but do it in the right place! 886 * Consider: "old_data (ptr is here) URG PUSH data" 887 * The old code would stop at the first push because 888 * it counted the urg (amount==1) and then does amount-- 889 * *after* the loop. This means tcp_readable() always 890 * returned zero if any URG PUSH was in the queue, even 891 * though there was normal data available. If we subtract 892 * the urg data right here, we even get it to work for more 893 * than one URG PUSH skb without normal data. 894 * This means that select() finally works now with urg data 895 * in the queue. Note that rlogin was never affected 896 * because it doesn't use select(); it uses two processes 897 * and a blocking read(). And the queue scan in tcp_read() 898 * was correct. Mike <pall@rz.uni-karlsruhe.de> 899 */ 900 if (skb->h.th->urg)
901 amount--; /* don't count urg data */ 902 if (amount && skb->h.th->psh) break;
903 skb = skb->next;
904 } 905 while(skb != (structsk_buff *)&sk->receive_queue);
906
907 restore_flags(flags);
908 if(sk->debug)
909 printk("got %lu bytes.\n",amount);
910 return(amount);
911 } 912
913 /* 914 * LISTEN is a special case for select.. 915 */ 916 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */ 917 { 918 if (sel_type == SEL_IN) { 919 intretval;
920
921 sk->inuse = 1;
922 retval = (tcp_find_established(sk) != NULL);
923 release_sock(sk);
924 if (!retval)
925 select_wait(&master_select_wakeup,wait);
926 returnretval;
927 } 928 return 0;
929 } 930
931
932 /* 933 * Wait for a TCP event. 934 * 935 * Note that we don't need to set "sk->inuse", as the upper select layers 936 * take care of normal races (between the test and the event) and we don't 937 * go look at any of the socket buffers directly. 938 */ 939 staticinttcp_select(structsock *sk, intsel_type, select_table *wait)
/* */ 940 { 941 if (sk->state == TCP_LISTEN)
942 returntcp_listen_select(sk, sel_type, wait);
943
944 switch(sel_type) { 945 caseSEL_IN:
946 if (sk->err)
947 return 1;
948 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
949 break;
950
951 if (sk->shutdown & RCV_SHUTDOWN)
952 return 1;
953
954 if (sk->acked_seq == sk->copied_seq)
955 break;
956
957 if (sk->urg_seq != sk->copied_seq ||
958 sk->acked_seq != sk->copied_seq+1 ||
959 sk->urginline || !sk->urg_data)
960 return 1;
961 break;
962
963 caseSEL_OUT:
964 if (sk->err)
965 return 1;
966 if (sk->shutdown & SEND_SHUTDOWN)
967 return 0;
968 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
969 break;
970 /* 971 * This is now right thanks to a small fix 972 * by Matt Dillon. 973 */ 974
975 if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
976 break;
977 return 1;
978
979 caseSEL_EX:
980 if (sk->urg_data)
981 return 1;
982 break;
983 } 984 select_wait(sk->sleep, wait);
985 return 0;
986 } 987
988 inttcp_ioctl(structsock *sk, intcmd, unsignedlongarg)
/* */ 989 { 990 interr;
991 switch(cmd)
992 { 993
994 caseTIOCINQ:
995 #ifdef FIXME /* FIXME: */ 996 caseFIONREAD:
997 #endif 998 { 999 unsignedlongamount;
1000
1001 if (sk->state == TCP_LISTEN)
1002 return(-EINVAL);
1003
1004 sk->inuse = 1;
1005 amount = tcp_readable(sk);
1006 release_sock(sk);
1007 err=verify_area(VERIFY_WRITE,(void *)arg,
1008 sizeof(unsignedlong));
1009 if(err)
1010 returnerr;
1011 put_fs_long(amount,(unsignedlong *)arg);
1012 return(0);
1013 }1014 caseSIOCATMARK:
1015 {1016 intansw = sk->urg_data && sk->urg_seq == sk->copied_seq;
1017
1018 err = verify_area(VERIFY_WRITE,(void *) arg,
1019 sizeof(unsignedlong));
1020 if (err)
1021 returnerr;
1022 put_fs_long(answ,(int *) arg);
1023 return(0);
1024 }1025 caseTIOCOUTQ:
1026 {1027 unsignedlongamount;
1028
1029 if (sk->state == TCP_LISTEN) return(-EINVAL);
1030 amount = sk->prot->wspace(sk);
1031 err=verify_area(VERIFY_WRITE,(void *)arg,
1032 sizeof(unsignedlong));
1033 if(err)
1034 returnerr;
1035 put_fs_long(amount,(unsignedlong *)arg);
1036 return(0);
1037 }1038 default:
1039 return(-EINVAL);
1040 }1041 }1042
1043
1044 /*1045 * This routine computes a TCP checksum. 1046 *1047 * Modified January 1995 from a go-faster DOS routine by1048 * Jorge Cwik <jorge@laser.satlink.net>1049 */1050
1051 unsignedshorttcp_check(structtcphdr *th, intlen,
/* */1052 unsignedlongsaddr, unsignedlongdaddr)
1053 {1054 returncsum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,
1055 csum_partial((char *)th,len,0));
1056 }1057
1058
1059
1060 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1061 unsignedlongdaddr, intlen, structsock *sk)
1062 {1063 th->check = 0;
1064 th->check = tcp_check(th, len, saddr, daddr);
1065 return;
1066 }1067
1068 /*1069 * This is the main buffer sending routine. We queue the buffer1070 * having checked it is sane seeming.1071 */1072
/*
 *	This is the main buffer sending routine.  Sanity-check the frame,
 *	then either transmit it immediately or queue it on the socket's
 *	write queue when the window / Nagle / congestion rules forbid
 *	sending now.
 */
static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 *	Length of packet (not counting length of pre-tcp headers).
	 */
	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 *	Sanity check it: must hold at least a TCP header and the
	 *	header pointer must lie inside the buffer.
	 */
	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 *	If we have queued a header size packet.. (these crash a few
	 *	tcp stacks if ack is not set).  A bare header is only legal
	 *	when it carries SYN or FIN, which count in sequence space.
	 */
	if (size == sizeof(struct tcphdr))
	{
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb, FREE_WRITE);
			return;
		}
	}

	/*
	 *	Actual processing: account the segment and record the
	 *	sequence number of its right edge (data bytes past the
	 *	TCP header) for retransmission bookkeeping.
	 */
	tcp_statistics.TcpOutSegs++;
	skb->h.seq = ntohl(th->seq) + size - 4*th->doff;

	/*
	 *	We must queue if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */
	if (after(skb->h.seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	    sk->packets_out >= sk->cong_window)
	{
		/* Checksum will be supplied by tcp_write_xmit.  So
		   we shouldn't need to set it at all.  I'm being paranoid. */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 *	If we don't fit we have to start the zero window
		 *	probes.  This is broken - we really need to do a
		 *	partial send _first_ (This is what causes the Cisco
		 *	and PC/TCP grief).
		 */
		if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 *	This is going straight out: stamp current ack and
		 *	window, checksum, and hand to the IP layer.
		 */
		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 *	This is mad.  The tcp retransmit queue is put together
		 *	by the ip layer.  This causes half the problems with
		 *	unroutable FIN's and other things.
		 */
		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 *	Set for next retransmit based on expected ACK time.
		 *	FIXME: We set this every time which means our
		 *	retransmits are really about a window behind.
		 */
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
1183 /*1184 * Locking problems lead us to a messy situation where we can have1185 * multiple partially complete buffers queued up. This is really bad1186 * as we don't want to be sending partial buffers. Fix this with1187 * a semaphore or similar to lock tcp_write per socket.1188 *1189 * These routines are pretty self descriptive.1190 */1191
/*
 *	Atomically detach and return the socket's pending partial frame
 *	(or NULL).  Interrupts are disabled while sk->partial and its
 *	flush timer are cleared, so an interrupt cannot see a half-updated
 *	pair.
 */
struct sk_buff * tcp_dequeue_partial(struct sock * sk)
{
	struct sk_buff * skb;
	unsigned long flags;

	save_flags(flags);
	cli();
	skb = sk->partial;
	if (skb) {
		sk->partial = NULL;
		/* Stop the 1-second flush timer armed by tcp_enqueue_partial. */
		del_timer(&sk->partial_timer);
	}
	restore_flags(flags);
	return skb;
}
1208 /*1209 * Empty the partial queue1210 */1211
1212 staticvoidtcp_send_partial(structsock *sk)
/* */1213 {1214 structsk_buff *skb;
1215
1216 if (sk == NULL)
1217 return;
1218 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1219 tcp_send_skb(sk, skb);
1220 }1221
1222 /*1223 * Queue a partial frame1224 */1225
/*
 *	Queue a partial frame on the socket, replacing (and later sending)
 *	any frame already queued.  A timer is armed so the frame is flushed
 *	within about a second even if no more data arrives.  The swap is
 *	done with interrupts off; the displaced frame is sent only after
 *	interrupts are restored.
 */
void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	tmp = sk->partial;
	if (tmp)
		/* An older partial is pending: cancel its flush timer. */
		del_timer(&sk->partial_timer);
	sk->partial = skb;
	init_timer(&sk->partial_timer);
	/*
	 *	Wait up to 1 second for the buffer to fill.
	 *	NOTE(review): expires is set to plain HZ here, i.e. a
	 *	relative 1-second delay - confirm against this kernel's
	 *	add_timer() semantics.
	 */
	sk->partial_timer.expires = HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	if (tmp)
		/* Send the frame we displaced, outside the cli region. */
		tcp_send_skb(sk, tmp);
}
1250
1251 /*1252 * This routine sends an ack and also updates the window. 1253 */1254
/*
 *	This routine sends an ack and also updates the window.
 *
 *	'sequence' is our send sequence, 'ack' the acknowledgement number,
 *	'th' the header of the segment being acknowledged (used to swap the
 *	port numbers), and 'daddr' the destination address.
 */
static void tcp_send_ack(unsigned long sequence, unsigned long ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;	/* We have been reset, we may not send again */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */
	buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 *	Force it to send an ack.  We don't have to do this
		 *	(ACK is unreliable) but it's much better use of
		 *	bandwidth on slow links to send a spare ack than
		 *	resend packets.
		 */
		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 *	Assemble a suitable TCP frame
	 */
	buff->len = sizeof(struct tcphdr);
	buff->sk = sk;
	buff->localroute = sk->localroute;
	t1 = (struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE, sk->ip_tos, sk->ip_ttl);
	if (tmp < 0)
	{
		/* Header build failed: release the buffer and give up
		   (ACKs are unreliable anyway). */
		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}
	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 + tmp);

	/* Start from the incoming header, then fix up each field. */
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */
	t1->dest = th->source;
	t1->source = th->dest;
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 *	If we have nothing queued for transmit and the transmit timer
	 *	is on we are just doing an ACK timeout and need to switch
	 *	to a keepalive.
	 */
	if (ack == sk->acked_seq)
	{
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
			&& sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 *	Fill in the packet and send it
	 */
	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}
1369
1370 /* 1371 * This routine builds a generic TCP header. 1372 */1373
/*
 *	This routine builds a generic TCP header for an outgoing data
 *	segment, starting from the socket's template header (dummy_th).
 *	PSH is set on the last segment of a write (push == 0).  As a side
 *	effect it clears the socket's pending-ack state, since this segment
 *	will carry the ACK.  Returns the header length in bytes.
 */
extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
{
	memcpy(th, (void *) &(sk->dummy_th), sizeof(*th));
	th->seq = htonl(sk->write_seq);
	th->psh = (push == 0) ? 1 : 0;		/* push only on final chunk */
	th->doff = sizeof(*th)/4;		/* no TCP options: 20 bytes */
	th->ack = 1;
	th->fin = 0;
	/* This segment carries our ACK: reset the delayed-ack state. */
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->ack_timed = 0;
	th->ack_seq = htonl(sk->acked_seq);
	sk->window = tcp_select_window(sk);
	th->window = htons(sk->window);

	return(sizeof(*th));
}
1393 /*1394 * This routine copies from a user buffer into a socket,1395 * and starts the transmit system.1396 */1397
/*
 *	This routine copies from a user buffer into a socket, and starts
 *	the transmit system.
 *
 *	Returns the number of bytes queued/sent, or a negative errno.
 *	When some data was already copied before an error is noticed, the
 *	byte count is returned instead of the error.  The socket lock
 *	(sk->inuse) is dropped around every operation that can sleep and
 *	retaken afterwards.
 */
static int tcp_write(struct sock *sk, unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;
	int copy;
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	unsigned char *buff;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse = 1;
	prot = sk->prot;
	while(len > 0)
	{
		if (sk->err)
		{			/* Stop on an error */
			release_sock(sk);
			if (copied)
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 *	First thing we do is make sure that we are established.
		 */
		if (sk->shutdown & SEND_SHUTDOWN)
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied)
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/*
		 *	Wait for a connection to finish.
		 */
		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
		{
			if (sk->err)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			/* Not connecting either: the connection is dead. */
			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
			{
				release_sock(sk);
				if (copied)
					return(copied);

				if (sk->err)
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen)
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			release_sock(sk);
			cli();

			/* Re-test under cli() so a state change between the
			   check and the sleep cannot be missed. */
			if (sk->state != TCP_ESTABLISHED &&
			    sk->state != TCP_CLOSE_WAIT && sk->err == 0)
			{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

	/*
	 *	The following code can result in copy <= if sk->mss is ever
	 *	decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
	 *	sk->mtu is constant once SYN processing is finished.  I.e. we
	 *	had better not get here until we've seen his SYN and at least one
	 *	valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
	 *	But ESTABLISHED should guarantee that.  sk->max_window is by
	 *	definition non-decreasing.  Note that any ioctl to set user_mss
	 *	must be done before the exchange of SYN's.  If the initial ack
	 *	from the other end has a window of 0, max_window and thus mss
	 *	will both be 0.
	 */

	/*
	 *	Now we need to check if we have a half built packet.
	 */
		if ((skb = tcp_dequeue_partial(sk)) != NULL)
		{
			int hdrlen;

			/* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				+ sizeof(struct tcphdr);

			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB))
			{
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0)
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}

				memcpy_fromfs(skb->data + skb->len, from, copy);
				skb->len += copy;
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy;
			}
			/* Full MSS, urgent data, or pipe idle: send now;
			   otherwise re-queue as a partial frame. */
			if ((skb->len - hdrlen) >= sk->mss ||
				(flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);
			continue;
		}

	/*
	 *	We also need to worry about the window.  If window < 1/2 the
	 *	maximum window we've seen from this host, don't use it.  This
	 *	is sender side silly window prevention, as specified in
	 *	RFC1122.  (Note that this is different than earlier versions
	 *	of SWS prevention, e.g. RFC813.)  What we actually do is use
	 *	the whole MSS.  Since the result is the right edge of the
	 *	packet being outside the window, it will be queued for later
	 *	rather than sent.
	 */
		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		if (copy > len)
			copy = len;

	/*
	 *	We should really check the window here also.
	 */
		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB))
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb;
		}
		else
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header, 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep.
		 */
		if (skb == NULL)
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition.
			 */
			tmp = sk->wmem_alloc;
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it: only sleep if no
			 *	write memory was freed meanwhile and the
			 *	connection is still usable.
			 */
			if (tmp <= sk->wmem_alloc &&
				(sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0)
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->len = 0;
		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

		buff = skb->data;

		/*
		 *	FIXME: we need to optimize this.
		 *	Perhaps some hints here would be good.
		 */
		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->mem_len, sk->ip_tos, sk->ip_ttl);
		if (tmp < 0)
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}
		skb->len += tmp;
		skb->dev = dev;
		buff += tmp;
		skb->h.th = (struct tcphdr *) buff;
		tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
		if (tmp < 0)
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB)
		{
			((struct tcphdr *)buff)->urg = 1;
			((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
		}
		skb->len += tmp;
		memcpy_fromfs(buff+tmp, from, copy);

		from += copy;
		copied += copy;
		len -= copy;
		skb->len += copy;
		skb->free = 0;
		sk->write_seq += copy;

		/* Sub-MSS frame while packets are in flight: hold it as a
		   partial frame (Nagle) rather than sending immediately. */
		if (send_tmp != NULL && sk->packets_out)
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

	/*
	 *	Nagle's rule.  Turn Nagle off with TCP_NODELAY for highly
	 *	interactive fast network servers.  It's meant to be on and
	 *	it really improves the throughput though not the echo time
	 *	on my slow slip link - Alan
	 */

	/*
	 *	Avoid possible race on send_tmp - c/o Johannes Stille
	 */
	if(sk->partial && ((!sk->packets_out)
	/* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq, sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
1723 /*1724 * This is just a wrapper. 1725 */1726
1727 staticinttcp_sendto(structsock *sk, unsignedchar *from,
/* */1728 intlen, intnonblock, unsignedflags,
1729 structsockaddr_in *addr, intaddr_len)
1730 {1731 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1732 return -EINVAL;
1733 if (sk->state == TCP_CLOSE)
1734 return -ENOTCONN;
1735 if (addr_len < sizeof(*addr))
1736 return -EINVAL;
1737 if (addr->sin_family && addr->sin_family != AF_INET)
1738 return -EINVAL;
1739 if (addr->sin_port != sk->dummy_th.dest)
1740 return -EISCONN;
1741 if (addr->sin_addr.s_addr != sk->daddr)
1742 return -EISCONN;
1743 returntcp_write(sk, from, len, nonblock, flags);
1744 }1745
1746
1747 /*1748 * Send an ack if one is backlogged at this point. Ought to merge1749 * this with tcp_send_ack().1750 */1751
/*
 *	Send an ack if one is backlogged at this point.  Ought to merge
 *	this with tcp_send_ack().  Builds a bare ACK segment from the
 *	socket's template header and transmits it.
 */
static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	/* Nothing owed: no ACK to send. */
	if (!sk->ack_backlog)
		return;

	/*
	 *	FIXME: we need to put code here to prevent this routine from
	 *	being called.  Being called once in a while is ok, so only
	 *	check if this is the second time in a row.
	 */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */
	buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->len = sizeof(struct tcphdr);
	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			IPPROTO_TCP, sk->opt, MAX_ACK_SIZE, sk->ip_tos, sk->ip_ttl);
	if (tmp < 0)
	{
		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}

	buff->len += tmp;
	t1 = (struct tcphdr *)(buff->data + tmp);

	/* Template header, then fill in the ACK specifics. */
	memcpy(t1, (void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	/* The backlog is being paid off by this ACK. */
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
1821
1822 /*1823 * FIXME:1824 * This routine frees used buffers.1825 * It should consider sending an ACK to let the1826 * other end know we now have a bigger window.1827 */1828
/*
 *	FIXME:
 *	This routine frees used buffers.
 *	It should consider sending an ACK to let the
 *	other end know we now have a bigger window.
 */
static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	/* Remember free receive space before cleaning, to detect growth. */
	left = sk->prot->rspace(sk);

	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can.  Stop at the first
	 *	buffer still unread or in use by a reader.
	 */
	while((skb = skb_peek(&sk->receive_queue)) != NULL)
	{
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */
	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
			left);
	if ((rspace = sk->prot->rspace(sk)) != left)
	{
		/*
		 *	This area has caused the most trouble.  The current
		 *	strategy is to simply do nothing if the other end has
		 *	room to send at least 3 full packets, because the ack
		 *	from those will automatically update the window.  If
		 *	the other end doesn't think we have much space left,
		 *	but we have room for at least 1 more complete packet
		 *	than it thinks we do, we will send an ack immediately.
		 *	Otherwise we will wait up to .5 seconds in case the
		 *	user reads some more.
		 */
		sk->ack_backlog++;
		/*
		 *	It's unclear whether to use sk->mtu or sk->mss here.
		 *	They differ only if the other end is offering a window
		 *	smaller than the agreed on MSS (called sk->mtu here).
		 *	In theory there's no connection between send and
		 *	receive, and so no reason to think that they're going
		 *	to send small packets.  For the moment I'm using the
		 *	hack of reducing the mss only on the send side, so I'm
		 *	putting mtu here.
		 */
		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			/*
			 *	NOTE(review): the comparison reads
			 *	sk->timer.expires although the timer cancelled
			 *	above is sk->retransmit_timer - looks like it
			 *	should be sk->retransmit_timer.expires; confirm
			 *	before changing.
			 */
			if (!was_active || TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);
		}
	}
}
1911
1912 /*1913 * Handle reading urgent data. BSD has very simple semantics for1914 * this, no blocking and very strange errors 8)1915 */1916
/*
 *	Handle reading urgent data.  BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 *
 *	Returns 1 and copies the single urgent byte to 'to' when valid
 *	urgent data is pending; otherwise returns 0 or a negative errno.
 *	Never blocks, regardless of 'nonblock'.
 */
static int tcp_read_urg(struct sock * sk, int nonblock,
	     unsigned char *to, int len, unsigned flags)
{
	/*
	 *	No URG data to read: inline mode delivers it in the normal
	 *	stream, and URG_READ means it was already consumed.
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
	{
		int tmp = -sk->err;
		sk->err = 0;
		return tmp;
	}

	if (sk->state == TCP_CLOSE || sk->done)
	{
		/* First EOF report returns 0; subsequent ones -ENOTCONN. */
		if (!sk->done) {
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID)
	{
		/* Low byte of urg_data holds the urgent byte itself. */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;
		put_fs_byte(c, to);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 *	Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 *	the available implementations agree in this case:
	 *	this call should never block, independent of the
	 *	blocking state of the socket.
	 *	Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
1969
1970 /*1971 * This routine copies from a sock struct into the user buffer. 1972 */1973
/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Walks the receive queue copying in-order data to user space,
 *	handling urgent data, FIN, peeking and blocking.  Returns the
 *	number of bytes copied or a negative errno.
 */
static int tcp_read(struct sock *sk, unsigned char *to,
	int len, int nonblock, unsigned flags)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	unsigned long peek_seq;
	volatile unsigned long *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */
	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */
	if (flags & MSG_OOB)
		return tcp_read_urg(sk, nonblock, to, len, flags);

	/*
	 *	Copying sequence to update.  This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 *	Peeking advances a private copy instead of copied_seq.
	 */
	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		unsigned long offset;

		/*
		 *	Are we at urgent data?  Stop if we have read anything.
		 */
		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.  Mark ourselves sleeping first so a
		 *	wakeup between the scan and schedule() is not lost.
		 */
		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* A gap before this buffer: nothing contiguous yet. */
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies one sequence number */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* fully consumed: eligible for freeing */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		if (copied)
			break;

		if (sk->err)
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			/* First EOF report is 0 bytes; later calls -ENOTCONN. */
			if (!sk->done)
			{
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Nothing available: ack what we've read, then sleep. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer.  We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */
		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */
		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?  If so, stop short of it
		 *	(or skip the urgent byte itself when not inline).
		 */
		if (sk->urg_data)
		{
			unsigned long urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					if (!sk->urginline)
					{
						++*seq;		/* skip the urgent byte */
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */
		*seq += used;

		/*
		 *	This memcpy_tofs can sleep.  If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */
		memcpy_tofs(to, ((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		to += used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb.  Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */
		skb->users --;

		if (after(sk->copied_seq, sk->urg_seq))
			sk->urg_data = 0;	/* urgent data fully passed */
		if (used + offset < skb->len)
			continue;

		/*
		 *	Process the FIN.
		 */
		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		continue;

	found_fin_ok:
		++*seq;		/* FIN occupies one sequence number */
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */
		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2201 /*2202 * State processing on a close. This implements the state shift for2203 * sending our FIN frame. Note that we only send a FIN for some 2204 * states. A shutdown() may have already sent the FIN, or we may be2205 * closed.2206 */2207
2208 staticinttcp_close_state(structsock *sk, intdead)
/* */2209 {2210 intns=TCP_CLOSE;
2211 intsend_fin=0;
2212 switch(sk->state)
2213 {2214 caseTCP_SYN_SENT: /* No SYN back, no FIN needed */2215 break;
2216 caseTCP_SYN_RECV:
2217 caseTCP_ESTABLISHED: /* Closedown begin */2218 ns=TCP_FIN_WAIT1;
2219 send_fin=1;
2220 break;
2221 caseTCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */2222 caseTCP_FIN_WAIT2:
2223 caseTCP_CLOSING:
2224 ns=sk->state;
2225 break;
2226 caseTCP_CLOSE:
2227 caseTCP_LISTEN:
2228 break;
2229 caseTCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and2230 wait only for the ACK */2231 ns=TCP_LAST_ACK;
2232 send_fin=1;
2233 }2234
2235 tcp_set_state(sk,ns);
2236
2237 /*2238 * This is a (useful) BSD violating of the RFC. There is a2239 * problem with TCP as specified in that the other end could2240 * keep a socket open forever with no application left this end.2241 * We use a 3 minute timeout (about the same as BSD) then kill2242 * our end. If they send after that then tough - BUT: long enough2243 * that we won't make the old 4*rto = almost no time - whoops2244 * reset mistake.2245 */2246 if(dead && ns==TCP_FIN_WAIT2)
2247 {2248 inttimer_active=del_timer(&sk->timer);
2249 if(timer_active)
2250 add_timer(&sk->timer);
2251 else2252 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2253 }2254
2255 returnsend_fin;
2256 }2257
2258 /*2259 * Send a fin.2260 */2261
/*
 *	Send a fin.
 *
 *	Builds a FIN|ACK segment from the socket's template header,
 *	advances write_seq past the FIN, and either transmits it at once
 *	or appends it to the write queue when data is still pending.
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot = (struct proto *)sk->prot;
	struct tcphdr *th = (struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev = NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	buff = prot->wmalloc(sk, MAX_RESET_SIZE, 1, GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */
	buff->sk = sk;
	buff->len = sizeof(*t1);
	buff->localroute = sk->localroute;
	t1 = (struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			IPPROTO_TCP, sk->opt,
			sizeof(struct tcphdr), sk->ip_tos, sk->ip_ttl);
	if (tmp < 0)
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost.
		 *	(Not good).  Still consume the FIN's sequence number
		 *	and make sure a close timer is running.
		 */
		buff->free = 1;
		prot->wfree(sk, buff->mem_addr, buff->mem_len);
		sk->write_seq++;
		t = del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}

	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */
	t1 = (struct tcphdr *)((char *)t1 + tmp);
	buff->len += tmp;
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	t1->seq = ntohl(sk->write_seq);
	sk->write_seq++;	/* FIN occupies one sequence number */
	buff->h.seq = sk->write_seq;
	t1->ack = 1;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(sk->window = tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 *	If there is data in the write queue, the fin must be appended
	 *	to the write queue so it goes out in order.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		buff->free = 0;
		if (buff->next != NULL)
		{
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	}
	else
	{
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or set sk->dead=1.
 *
 *	sk:	the connection to shut down
 *	how:	direction mask; only SEND_SHUTDOWN is acted on here.
 *
 *	Queues a FIN (via tcp_send_fin) if the connection is in a state
 *	where one has not already been sent.
 */

void tcp_shutdown(struct sock *sk, int how)
{
	/*
	 *	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 *	Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
	 */

	if (!(how & SEND_SHUTDOWN))
		return;

	/*
	 *	If we've already sent a FIN, or it's a closed state,
	 *	there is nothing more to do.
	 */

	if (sk->state == TCP_FIN_WAIT1 ||
	    sk->state == TCP_FIN_WAIT2 ||
	    sk->state == TCP_CLOSING ||
	    sk->state == TCP_LAST_ACK ||
	    sk->state == TCP_TIME_WAIT ||
	    sk->state == TCP_CLOSE ||
	    sk->state == TCP_LISTEN
	  )
	{
		return;
	}
	/* Lock the socket against the bottom half (pre-SMP "inuse" lock). */
	sk->inuse = 1;

	/*
	 *	Flag that the sender has shutdown.
	 */

	sk->shutdown |= SEND_SHUTDOWN;

	/*
	 *	Clear out any half completed packets so the FIN goes out
	 *	behind all pending data.
	 */

	if (sk->partial)
		tcp_send_partial(sk);

	/*
	 *	FIN if needed: tcp_close_state() moves the state machine and
	 *	tells us whether a FIN must actually be transmitted.
	 */

	if (tcp_close_state(sk,0))
		tcp_send_fin(sk);

	release_sock(sk);
}
2417
2418 staticint2419 tcp_recvfrom(structsock *sk, unsignedchar *to,
/* */2420 intto_len, intnonblock, unsignedflags,
2421 structsockaddr_in *addr, int *addr_len)
2422 {2423 intresult;
2424
2425 /* 2426 * Have to check these first unlike the old code. If 2427 * we check them after we lose data on an error2428 * which is wrong 2429 */2430
2431 if(addr_len)
2432 *addr_len = sizeof(*addr);
2433 result=tcp_read(sk, to, to_len, nonblock, flags);
2434
2435 if (result < 0)
2436 return(result);
2437
2438 if(addr)
2439 {2440 addr->sin_family = AF_INET;
2441 addr->sin_port = sk->dummy_th.dest;
2442 addr->sin_addr.s_addr = sk->daddr;
2443 }2444 return(result);
2445 }2446
2447
/*
 *	This routine will send an RST to the other tcp.
 *
 *	saddr/daddr:	addresses for the outgoing packet (already from our
 *			point of view, i.e. saddr is our side)
 *	th:		the offending segment's TCP header
 *	prot/opt/dev:	protocol ops, IP options and device for building
 *			the reply
 *	tos/ttl:	IP header fields to use
 *
 *	Sequence/ack fields follow RFC 793's reset generation rules:
 *	if the incoming segment had ACK set, the RST carries its ack_seq
 *	as our sequence; otherwise we send seq 0 and ACK the incoming
 *	segment's sequence (+1 for a SYN).
 */

static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	int tmp;
	struct device *ndev=NULL;

	/*
	 *	Cannot reset a reset (Think about it).
	 */

	if(th->rst)
		return;

	/*
	 *	We need to grab some memory, and put together an RST,
	 *	and then put it into the queue to be sent.
	 *	GFP_ATOMIC: we may be on the bottom half; failure just
	 *	drops the reset silently.
	 */

	buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
		return;

	buff->len = sizeof(*t1);
	buff->sk = NULL;		/* not owned by any socket */
	buff->dev = dev;
	buff->localroute = 0;

	t1 = (struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr),tos,ttl);
	if (tmp < 0)
	{
		/* Could not build a header: give the buffer back. */
		buff->free = 1;
		prot->wfree(NULL, buff->mem_addr, buff->mem_len);
		return;
	}

	t1 = (struct tcphdr *)((char *)t1 +tmp);
	buff->len += tmp;
	/* Start from a copy of the offending header, then rewrite fields. */
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->rst = 1;
	t1->window = 0;

	if(th->ack)
	{
		/* Peer told us what it expects: use that as our sequence. */
		t1->ack = 0;
		t1->seq = th->ack_seq;
		t1->ack_seq = 0;
	}
	else
	{
		/* No ACK in the incoming segment: ack its sequence space. */
		t1->ack = 1;
		if(!th->syn)
			t1->ack_seq=htonl(th->seq);
		else
			t1->ack_seq=htonl(th->seq+1);	/* SYN occupies one sequence number */
		t1->seq=0;
	}

	t1->syn = 0;
	t1->urg = 0;
	t1->fin = 0;
	t1->psh = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
	prot->queue_xmit(NULL, ndev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2535
2536 /*2537 * Look for tcp options. Parses everything but only knows about MSS.2538 * This routine is always called with the packet containing the SYN.2539 * However it may also be called with the ack to the SYN. So you2540 * can't assume this is always the SYN. It's always called after2541 * we have set up sk->mtu to our own MTU.2542 *2543 * We need at minimum to add PAWS support here. Possibly large windows2544 * as Linux gets deployed on 100Mb/sec networks.2545 */2546
2547 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2548 {2549 unsignedchar *ptr;
2550 intlength=(th->doff*4)-sizeof(structtcphdr);
2551 intmss_seen = 0;
2552
2553 ptr = (unsignedchar *)(th + 1);
2554
2555 while(length>0)
2556 {2557 intopcode=*ptr++;
2558 intopsize=*ptr++;
2559 switch(opcode)
2560 {2561 caseTCPOPT_EOL:
2562 return;
2563 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */2564 length--;
2565 ptr--; /* the opsize=*ptr++ above was a mistake */2566 continue;
2567
2568 default:
2569 if(opsize<=2) /* Avoid silly options looping forever */2570 return;
2571 switch(opcode)
2572 {2573 caseTCPOPT_MSS:
2574 if(opsize==4 && th->syn)
2575 {2576 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2577 mss_seen = 1;
2578 }2579 break;
2580 /* Add other options here as people feel the urge to implement stuff like large windows */2581 }2582 ptr+=opsize-2;
2583 length-=opsize;
2584 }2585 }2586 if (th->syn)
2587 {2588 if (! mss_seen)
2589 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2590 }2591 #ifdefCONFIG_INET_PCTCP2592 sk->mss = min(sk->max_window >> 1, sk->mtu);
2593 #else2594 sk->mss = min(sk->max_window, sk->mtu);
2595 #endif2596 }2597
/*
 *	Return the classful (A/B/C) network mask for an address.
 *	The argument and result are both in network byte order.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);
	unsigned long mask;

	if (IN_CLASSA(host))
		mask = IN_CLASSA_NET;
	else if (IN_CLASSB(host))
		mask = IN_CLASSB_NET;
	else
		mask = IN_CLASSC_NET;

	return htonl(mask);
}
/*
 *	Default sequence number picking algorithm.
 *	As close as possible to RFC 793, which
 *	suggests using a 250kHz clock.
 *	Further reading shows this assumes 2MB/s networks.
 *	For 10MB/s ethernet, a 1MHz clock is appropriate.
 *	That's funny, Linux has one built in! Use it!
 *
 *	NOTE(review): a purely clock-derived ISN is predictable and makes
 *	sequence-guessing attacks easier — later kernels mix in secret
 *	per-connection state. Flagged, not changed here.
 */

extern inline unsigned long tcp_init_seq(void)
{
	struct timeval tv;
	do_gettimeofday(&tv);
	/* Microsecond clock == 1MHz ISN clock. Wraps harmlessly in 32 bits. */
	return tv.tv_usec+tv.tv_sec*1000000;
}
/*
 *	This routine handles a connection request.
 *	It should make sure we haven't already responded.
 *	Because of the way BSD works, we have to send a syn/ack now.
 *	This also means it will be harder to close a socket which is
 *	listening.
 *
 *	sk:		the listening socket
 *	skb:		the incoming SYN segment (ownership taken: either
 *			freed on failure or queued on sk->receive_queue)
 *	daddr/saddr:	our address / peer address as seen on the wire
 *	opt/dev:	IP options and arrival device
 *	seq:		initial send sequence number for the new connection
 */

static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 unsigned long daddr, unsigned long saddr,
		 struct options *opt, struct device *dev, unsigned long seq)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	unsigned char *ptr;
	struct sock *newsk;
	struct tcphdr *th;
	struct device *ndev=NULL;
	int tmp;
	struct rtable *rt;

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead)
	{
		sk->data_ready(sk,0);
	}
	else
	{
		if(sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * Make sure we can accept more.  This will prevent a
	 * flurry of syns from eating up all our memory.
	 * (Drop silently: the peer will retransmit its SYN.)
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * We need to build a new sock struct.
	 * It is sort of bad to have a socket without an inode attached
	 * to it, but the wake_up's will just wake up the listening socket,
	 * and if the listening socket is destroyed before this is taken
	 * off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* just ignore the syn. It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/* Clone the listener, then reset everything connection-specific. */
	memcpy(newsk, sk, sizeof(*newsk));
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;	/* start slow start at one segment */
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	newsk->acked_seq = skb->h.th->seq+1;	/* SYN consumes a sequence number */
	newsk->copied_seq = skb->h.th->seq+1;
	newsk->fin_seq = skb->h.th->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger=0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function=&retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/*
	 *	Swap these two, they are from our point of view.
	 */

	newsk->daddr = saddr;
	newsk->saddr = daddr;

	put_sock(newsk->num,newsk);
	newsk->dummy_th.res1 = 0;
	newsk->dummy_th.doff = 6;
	newsk->dummy_th.fin = 0;
	newsk->dummy_th.syn = 0;
	newsk->dummy_th.rst = 0;
	newsk->dummy_th.psh = 0;
	newsk->dummy_th.ack = 0;
	newsk->dummy_th.urg = 0;
	newsk->dummy_th.res2 = 0;
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->socket = NULL;

	/*
	 *	Grab the ttl and tos values and use them
	 */

	newsk->ip_ttl=sk->ip_ttl;
	newsk->ip_tos=skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for
	 */

	/*
	 *	Note use of sk->user_mss, since user has no direct access to newsk
	 */

	rt=ip_rt_route(saddr, NULL,NULL);

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
		newsk->mtu = rt->rt_mss - HEADER_SIZE;
	else
	{
#ifdef CONFIG_INET_SNARL	/* Sub Nets Are Local */
		if ((saddr ^ daddr) & default_mask(saddr))
#else
		if ((saddr ^ daddr) & dev->pa_mask)
#endif
			newsk->mtu = 576 - HEADER_SIZE;	/* off-net: conservative default */
		else
			newsk->mtu = MAX_WINDOW;
	}

	/*
	 *	But not bigger than device MTU
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	This will min with what arrived in the packet
	 */

	tcp_options(newsk,skb->h.th);

	tcp_cache_zap();

	buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* NOTE(review): error is reported on the *listening* socket here. */
		sk->err = ENOMEM;
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		/* And this will destroy it */
		release_sock(newsk);
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->len = sizeof(struct tcphdr)+4;	/* header plus 4 bytes of MSS option */
	buff->sk = newsk;
	buff->localroute = newsk->localroute;

	t1 = (struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
			       IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

	/*
	 *	Something went wrong.
	 */

	if (tmp < 0)
	{
		sk->err = tmp;
		buff->free = 1;
		kfree_skb(buff,FREE_WRITE);
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		release_sock(newsk);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 +tmp);

	/* Build the SYN-ACK from a copy of the incoming header. */
	memcpy(t1, skb->h.th, sizeof(*t1));
	buff->h.seq = newsk->write_seq;
	/*
	 *	Swap the send and the receive.
	 */
	t1->dest = skb->h.th->source;
	t1->source = newsk->dummy_th.source;
	t1->seq = ntohl(newsk->write_seq++);
	t1->ack = 1;
	newsk->window = tcp_select_window(newsk);
	newsk->sent_seq = newsk->write_seq;
	t1->window = ntohs(newsk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->ack_seq = ntohl(skb->h.th->seq+1);
	t1->doff = sizeof(*t1)/4+1;		/* +1 word for the MSS option */
	ptr = (unsigned char *)(t1+1);
	ptr[0] = 2;				/* kind: MSS */
	ptr[1] = 4;				/* length */
	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
	ptr[3] = (newsk->mtu) & 0xff;

	tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
	reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
	skb->sk = newsk;

	/*
	 *	Charge the sock_buff to newsk.
	 */

	sk->rmem_alloc -= skb->mem_len;
	newsk->rmem_alloc += skb->mem_len;

	/* Park the SYN on the listener's queue: accept() picks it up. */
	skb_queue_tail(&sk->receive_queue,skb);
	sk->ack_backlog++;
	release_sock(newsk);
	tcp_statistics.TcpOutSegs++;
}
2906
/*
 *	Close a TCP socket.
 *
 *	sk:		the socket being closed
 *	timeout:	non-zero means "close immediately" (drop straight to
 *			TCP_CLOSE); zero is the normal descriptor close, which
 *			flushes the receive queue and sends a FIN if the state
 *			machine says one is needed.
 */

static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 * We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 */

	sk->inuse = 1;		/* lock against the bottom half */

	if(th_cache_sk==sk)
		tcp_cache_zap();
	if(sk->state == TCP_LISTEN)
	{
		/* Special case: a listener never sends a FIN, just reap
		   any embryonic connections and go straight to CLOSE. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;

	if (!sk->dead)
		sk->state_change(sk);

	if (timeout == 0)
	{
		struct sk_buff *skb;

		/*
		 * We need to flush the recv. buffs.  We do this only on the
		 * descriptor close, not protocol-sourced closes, because the
		 * reader process may not have drained the data yet!
		 */

		while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 * Get rid off any half-completed packets.
		 */

		if (sk->partial)
			tcp_send_partial(sk);
	}

	/*
	 * Timeout is not the same thing - however the code likes
	 * to send both the same way (sigh).
	 */

	if(timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* tcp_close_state(sk,1)==1 means a FIN must go out now. */
		if(tcp_close_state(sk,1)==1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
2973
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue. This happens as incoming acks
 *	open up the remote window for us.
 */

static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy.
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->h.seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it. This
			 *	cannot currently occur.
			 */

			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;

			/*
			 *	Put in the ack seq and window at this point rather
			 *	than earlier, in order to keep them monotonic.
			 *	We really want to avoid taking back window
			 *	allocations. That's legal, but RFC1122 says it's
			 *	frowned on. Ack and window will in general have
			 *	changed since this packet was put on the write queue.
			 */
			iph = (struct iphdr *)(skb->data +
					skb->dev->hard_header_len);
			th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
			size = skb->len - (((unsigned char *) th) - skb->data);

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			/* Header fields changed, so the checksum must be redone. */
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */

			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */

			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3065
/*
 *	This routine deals with incoming acks, but not outgoing ones.
 *
 *	Handles: window updates (including illegal window shrinks), slow
 *	start / congestion avoidance, RTT estimation (Jacobson '88 with
 *	Karn's algorithm), retransmit-queue trimming, zero-window probe
 *	completion, and the ACK-driven state transitions (LAST_ACK,
 *	FIN_WAIT1, CLOSING, SYN_RECV).
 *
 *	Returns 0 if the ack is ahead of anything we sent (caller should
 *	treat the segment as unacceptable), 1 otherwise.
 */

extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
	unsigned long ack;
	int flag = 0;

	/*
	 * 1 - there was data in packet as well as ack or new data is sent or
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if(sk->zapped)
		return(1);	/* Dead, cant ack any more so why bother */

	/*
	 *	Have we discovered a larger window?
	 */

	ack = ntohl(th->ack_seq);

	if (ntohs(th->window) > sk->max_window)
	{
		sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(sk->max_window>>1, sk->mtu);
#else
		sk->mss = min(sk->max_window, sk->mtu);
#endif
	}

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */

	if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
	{
		if(sk->debug)
			printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);

		/*
		 *	Keepalive processing.
		 */

		if (after(ack, sk->sent_seq))
		{
			return(0);
		}

		/*
		 *	Restart the keepalive timer.
		 */

		if (sk->keepopen)
		{
			if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
		}
		return(1);
	}

	/*
	 *	If there is data set flag 1
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 *	See if our window has been shrunk.
	 */

	if (after(sk->window_seq, ack+ntohs(th->window)))
	{
		/*
		 * We may need to move packets from the send queue
		 * to the write queue, if the window has been shrunk on us.
		 * The RFC says you are not allowed to shrink your window
		 * like this, but if the other end does, you must be able
		 * to deal with it.
		 */
		struct sk_buff *skb;
		struct sk_buff *skb2;
		struct sk_buff *wskb = NULL;

		skb2 = sk->send_head;
		sk->send_head = NULL;
		sk->send_tail = NULL;

		/*
		 * This is an artifact of a flawed concept. We want one
		 * queue and a smarter send routine when we send all.
		 */

		flag |= 4;	/* Window changed */

		sk->window_seq = ack + ntohs(th->window);
		cli();		/* walk both queues atomically */
		while (skb2 != NULL)
		{
			skb = skb2;
			skb2 = skb->link3;
			skb->link3 = NULL;
			if (after(skb->h.seq, sk->window_seq))
			{
				/* Now outside the window: back to the write queue. */
				if (sk->packets_out > 0)
					sk->packets_out--;
				/* We may need to remove this from the dev send list. */
				if (skb->next != NULL)
				{
					skb_unlink(skb);
				}
				/* Now add it to the write_queue, preserving order. */
				if (wskb == NULL)
					skb_queue_head(&sk->write_queue,skb);
				else
					skb_append(wskb,skb);
				wskb = skb;
			}
			else
			{
				/* Still inside the window: keep on the send list. */
				if (sk->send_head == NULL)
				{
					sk->send_head = skb;
					sk->send_tail = skb;
				}
				else
				{
					sk->send_tail->link3 = skb;
					sk->send_tail = skb;
				}
				skb->link3 = NULL;
			}
		}
		sti();
	}

	/*
	 *	Pipe has emptied.
	 */

	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out= 0;
	}

	/*
	 *	Update the right hand window edge of the host.
	 */

	sk->window_seq = ack + ntohs(th->window);

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a
		 * counter and increment it once every cwnd times.  It's possible
		 * that this should be done only if sk->retransmits == 0.  I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase (slow start).
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly.  In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

	/*
	 *	Remember the highest ack received.
	 */

	sk->rcv_ack_seq = ack;

	/*
	 *	If this ack opens up a zero window, clear backoff.  It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open?
		 */

		if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
		    ! before (sk->window_seq, sk->write_queue.next->h.seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt.  this eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < 20)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = 20;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	while(sk->send_head != NULL)
	{
		/* Check for a bug. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived the other end.
		 */

		if (before(sk->send_head->h.seq, ack+1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 *	We were retransmitting.  don't count this in RTT est.
				 */
				flag |= 2;

				/*
				 * even though we've gotten an ack, we're still
				 * retransmitting as long as we're sending from
				 * the retransmit queue.  Keeping retransmits non-zero
				 * prevents us from getting new data interspersed with
				 * retransmissions.
				 */

				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 * Note that we only reset backoff and rto in the
			 * rtt recomputation code.  And that doesn't happen
			 * if there were retransmissions in effect.  So the
			 * first new packet after the retransmissions is
			 * sent with the backoff still in effect.  Not until
			 * we get an ack from a non-retransmitted packet do
			 * we reset the backoff and rto.  This allows us to deal
			 * with a situation where the network delay has increased
			 * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 *	We have one less packet out there.
			 */

			if (sk->packets_out > 0)
				sk->packets_out --;
			/*
			 *	Wake up the process, it can probably write more.
			 */
			if (!sk->dead)
				sk->write_space(sk);
			oskb = sk->send_head;

			if (!(flag&2))	/* Not retransmitting */
			{
				long m;

				/*
				 * The following amusing code comes from Jacobson's
				 * article in SIGCOMM '88.  Note that rtt and mdev
				 * are scaled versions of rtt and mean deviation.
				 * This is designed to be as fast as possible
				 * m stands for "measurement".
				 */

				m = jiffies - oskb->when;  /* RTT */
				if(m<=0)
					m=1;		/* IS THIS RIGHT FOR <0 ??? */
				m -= (sk->rtt >> 3);	/* m is now error in rtt est */
				sk->rtt += m;		/* rtt = 7/8 rtt + 1/8 new */
				if (m < 0)
					m = -m;		/* m is now abs(error) */
				m -= (sk->mdev >> 2);	/* similar update on mdev */
				sk->mdev += m;		/* mdev = 3/4 mdev + 1/4 new */

				/*
				 *	Now update timeout.  Note that this removes any backoff.
				 */

				sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
				if (sk->rto > 120*HZ)
					sk->rto = 120*HZ;
				if (sk->rto < 20)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
					sk->rto = 20;
				sk->backoff = 0;
			}
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
					   In this case as we just set it up */
			cli();
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 *	We may need to remove this from the dev send list.
			 */

			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 * XXX someone ought to look at this too.. at the moment, if skb_peek()
	 * returns non-NULL, we completely ignore the timer stuff in the else
	 * clause.  We ought to organize the code so that else clause can
	 * (should) be executed regardless, possibly moving the PROBE timer
	 * reset over.  The skb_peek() thing should only move stuff to the
	 * write queue, NOT also manage the timer functions.
	 */

	/*
	 *	Maybe we can take some stuff off of the write queue,
	 *	and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room: start zero-window probing.
			 */
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 * from TCP_CLOSE we don't do anything
		 *
		 * from anything else, if there is write data (or fin) pending,
		 * we use a TIME_WRITE timeout, else if keepalive we reset to
		 * a KEEPALIVE timeout, else we delete the timer.
		 *
		 * We do not set flag for nominal write data, otherwise we may
		 * force a state where we start to write itsy bitsy tidbits
		 * of data.
		 */

		switch(sk->state) {
		case TCP_TIME_WAIT:
			/*
			 * keep us in TIME_WAIT until we stop getting packets,
			 * reset the timeout.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 * don't touch the timer.
			 */
			break;
		default:
			/*
			 * Must check send_head, write_queue, and ack_backlog
			 * to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 *	We have nothing queued but space to send.  Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
	 * we are now waiting for an acknowledge to our FIN.  The other end is
	 * already in TIME_WAIT.
	 *
	 * Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if(sk->debug)
			printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			tcp_set_state(sk,TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 * Move to FIN_WAIT2 to await a FIN from the other end.  Set
	 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 * Move to TIME_WAIT.
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake.
	 */

	if(sk->state==TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest=th->source;
		sk->copied_seq = sk->acked_seq;
		if(!sk->dead)
			sk->state_change(sk);
		if(sk->max_window==0)
		{
			sk->max_window=32;	/* Sanity check */
			sk->mss=min(sk->max_window,sk->mtu);
		}
	}

	/*
	 * I make no guarantees about the first clause in the following
	 * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
	 * what conditions "!flag" would be true.  However I think the rest
	 * of the conditions would prevent that from causing any
	 * unnecessary retransmission.
	 *   Clearly if the first packet has expired it should be
	 * retransmitted.  The other alternative, "flag&2 && retransmits", is
	 * harder to explain:  You have to look carefully at how and when the
	 * timer is set and with what timeout.  The most recent transmission always
	 * sets the timer.  So in general if the most recent thing has timed
	 * out, everything before it has as well.  So we want to go ahead and
	 * retransmit some more.  If we didn't explicitly test for this
	 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 * would not be true.  If you look at the pattern of timing, you can
	 * show that rto is increased fast enough that the next packet would
	 * almost never be retransmitted immediately.  Then you'd end up
	 * waiting for a timeout to send each packet on the retransmission
	 * queue.  With my implementation of the Karn sampling algorithm,
	 * the timeout would double each time.  The net result is that it would
	 * take a hideous amount of time to recover from a single dropped packet.
	 * It's possible that there should also be a test for TIME_WRITE, but
	 * I think as long as "send_head != NULL" and "retransmit" is on, we've
	 * got to be in real retransmission mode.
	 *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
	 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 * As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	       (((flag&2) && sk->retransmits) ||
	       (sk->send_head->when + sk->rto < jiffies)))
	{
		if(sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return(1);
}
3656
/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 *
 *	Returns 0 in all cases; the caller (tcp_data) is responsible for
 *	acking the FIN.
 */

static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	/* Right edge of the peer's stream: data + SYN/FIN each occupy
	   one unit of sequence space. */
	sk->fin_seq = th->seq + skb->len + th->syn + th->fin;

	/* Wake anyone sleeping on this socket and post SIGIO if asked. */
	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * move to CLOSE_WAIT, tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			/* A FIN carried on a RST shuts both directions down. */
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * received a retransmission of the FIN,
			 * restart the TIME_WAIT timer.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens, we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */
			if(sk->ip_xmit_timeout != TIME_WRITE)
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * received a FIN -- send ACK and enter TIME_WAIT
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * already in CLOSE
			 */
			break;
		default:
			/* Any other state (e.g. LAST_ACK re-entry): fall back
			   to LAST_ACK and arm the give-up timer. */
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}
3752
3753
/*
 *	This routine handles the data. If there is room in the buffer,
 *	it will be have already been moved into it. If there is no
 *	room, then we will just have to discard the packet.
 *
 *	Called from tcp_rcv with the socket locked (sk->inuse set).
 *	Always returns 0; the skb is either queued on sk->receive_queue
 *	or freed here.
 */

extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk,
	 unsigned long saddr, unsigned short len)
{
	struct sk_buff *skb1, *skb2;
	struct tcphdr *th;
	int dup_dumped=0;
	unsigned long new_seq;
	unsigned long shut_seq;

	th = skb->h.th;
	/* Strip the TCP header: skb->len becomes the payload length. */
	skb->len = len -(th->doff*4);

	/*
	 *	The bytes in the receive read/assembly queue has increased. Needed for the
	 *	low memory discard algorithm
	 */

	sk->bytes_rcv += skb->len;

	if (skb->len == 0 && !th->fin)
	{
		/*
		 *	Don't want to keep passing ack's back and forth.
		 *	(someone sent us dataless, boring frame)
		 */
		if (!th->ack)
			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
		kfree_skb(skb, FREE_READ);
		return(0);
	}

	/*
	 *	We no longer have anyone receiving data on this connection.
	 */

#ifndef TCP_DONT_RST_SHUTDOWN

	if(sk->shutdown & RCV_SHUTDOWN)
	{
		/*
		 *	FIXME: BSD has some magic to avoid sending resets to
		 *	broken 4.2 BSD keepalives. Much to my surprise a few non
		 *	BSD stacks still have broken keepalives so we want to
		 *	cope with it.
		 */

		if(skb->len)	/* We don't care if it's just an ack or
				   a keepalive/window probe */
		{
			new_seq= th->seq + skb->len + th->syn;	/* Right edge of _data_ part of frame */

			/* Do this the way 4.4BSD treats it. Not what I'd
			   regard as the meaning of the spec but it's what BSD
			   does and clearly they know everything 8) */

			/*
			 *	This is valid because of two things
			 *
			 *	a) The way tcp_data behaves at the bottom.
			 *	b) A fin takes effect when read not when received.
			 */

			shut_seq=sk->acked_seq+1;	/* Last byte */

			if(after(new_seq,shut_seq))
			{
				if(sk->debug)
					printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n",
						sk, new_seq, shut_seq, sk->blog);
				if(sk->dead)
				{
					/* Nobody will ever read this data:
					   reset the connection hard. */
					sk->acked_seq = new_seq + th->fin;
					tcp_reset(sk->saddr, sk->daddr, skb->h.th,
						sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
					tcp_statistics.TcpEstabResets++;
					tcp_set_state(sk,TCP_CLOSE);
					sk->err = EPIPE;
					sk->shutdown = SHUTDOWN_MASK;
					kfree_skb(skb, FREE_READ);
					return 0;
				}
			}
		}
	}

#endif

	/*
	 * 	Now we have to walk the chain, and figure out where this one
	 * 	goes into it.  This is set up so that the last packet we received
	 * 	will be the first one we look at, that way if everything comes
	 * 	in order, there will be no performance loss, and if they come
	 * 	out of order we will be able to fit things in nicely.
	 *
	 *	[AC: This is wrong. We should assume in order first and then walk
	 *	 forwards from the first hole based upon real traffic patterns.]
	 *
	 */

	if (skb_peek(&sk->receive_queue) == NULL) 	/* Empty queue is easy case */
	{
		skb_queue_head(&sk->receive_queue,skb);
		skb1= NULL;
	}
	else
	{
		/* Walk backwards from the newest (tail) entry. */
		for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
		{
			if(sk->debug)
			{
				printk("skb1=%p :", skb1);
				printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
				printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
				printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
						sk->acked_seq);
			}

			/*
			 *	Optimisation: Duplicate frame or extension of previous frame from
			 *	same sequence point (lost ack case).
			 *	The frame contains duplicate data or replaces a previous frame
			 *	discard the previous frame (safe as sk->inuse is set) and put
			 *	the new one in its place.
			 */

			if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
			{
				skb_append(skb1,skb);
				skb_unlink(skb1);
				kfree_skb(skb1,FREE_READ);
				dup_dumped=1;
				skb1=NULL;
				break;
			}

			/*
			 *	Found where it fits
			 */

			if (after(th->seq+1, skb1->h.th->seq))
			{
				skb_append(skb1,skb);
				break;
			}

			/*
			 *	See if we've hit the start. If so insert.
			 */
			if (skb1 == skb_peek(&sk->receive_queue))
			{
				skb_queue_head(&sk->receive_queue, skb);
				break;
			}
		}
	}

	/*
	 *	Figure out what the ack value for this frame is
	 */

	th->ack_seq = th->seq + skb->len;
	if (th->syn)
		th->ack_seq++;
	if (th->fin)
		th->ack_seq++;

	if (before(sk->acked_seq, sk->copied_seq))
	{
		/* Internal consistency check: should never fire. */
		printk("*** tcp.c:tcp_data bug acked < copied\n");
		sk->acked_seq = sk->copied_seq;
	}

	/*
	 *	Now figure out if we can ack anything. This is very messy because we really want two
	 *	receive queues, a completed and an assembly queue. We also want only one transmit
	 *	queue.
	 */

	if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
	{
		if (before(th->seq, sk->acked_seq+1))
		{
			int newwindow;

			/* Advance acked_seq and shrink the offered window by
			   the amount of new data consumed. */
			if (after(th->ack_seq, sk->acked_seq))
			{
				newwindow = sk->window-(th->ack_seq - sk->acked_seq);
				if (newwindow < 0)
					newwindow = 0;
				sk->window = newwindow;
				sk->acked_seq = th->ack_seq;
			}
			skb->acked = 1;

			/*
			 *	When we ack the fin, we do the FIN
			 *	processing.
			 */

			if (skb->h.th->fin)
			{
				tcp_fin(skb,sk,skb->h.th);
			}

			/* See whether the newly contiguous data lets us ack
			   frames queued after this one (hole filled). */
			for(skb2 = skb->next;
			    skb2 != (struct sk_buff *)&sk->receive_queue;
			    skb2 = skb2->next)
			{
				if (before(skb2->h.th->seq, sk->acked_seq+1))
				{
					if (after(skb2->h.th->ack_seq, sk->acked_seq))
					{
						newwindow = sk->window -
						 (skb2->h.th->ack_seq - sk->acked_seq);
						if (newwindow < 0)
							newwindow = 0;
						sk->window = newwindow;
						sk->acked_seq = skb2->h.th->ack_seq;
					}
					skb2->acked = 1;
					/*
					 *	When we ack the fin, we do
					 *	the fin handling.
					 *
					 *	NOTE(review): this passes skb/skb->h.th,
					 *	not skb2/skb2->h.th, even though it is
					 *	skb2's FIN being acked — looks like a
					 *	copy/paste slip; confirm before changing.
					 */
					if (skb2->h.th->fin)
					{
						tcp_fin(skb,sk,skb->h.th);
					}

					/*
					 *	Force an immediate ack.
					 */

					sk->ack_backlog = sk->max_ack_backlog;
				}
				else
				{
					break;
				}
			}

			/*
			 *	This also takes care of updating the window.
			 *	This if statement needs to be simplified.
			 */
			if (!sk->delay_acks ||
			    sk->ack_backlog >= sk->max_ack_backlog ||
			    sk->bytes_rcv > sk->max_unacked || th->fin) {
	/*			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
			}
			else
			{
				/* Delay the ack: count it and arm a timer so
				   it goes out even if no data follows. */
				sk->ack_backlog++;
				if(sk->debug)
					printk("Ack queued.\n");
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
		}
	}

	/*
	 *	If we've missed a packet, send an ack.
	 *	Also start a timer to send another.
	 */

	if (!skb->acked)
	{

	/*
	 *	This is important.  If we don't have much room left,
	 *	we need to throw out a few packets so we have a good
	 *	window.  Note that mtu is used, not mss, because mss is really
	 *	for the send side.  He could be sending us stuff as large as mtu.
	 */

		while (sk->prot->rspace(sk) < sk->mtu)
		{
			skb1 = skb_peek(&sk->receive_queue);
			if (skb1 == NULL)
			{
				printk("INET: tcp.c:tcp_data memory leak detected.\n");
				break;
			}

			/*
			 *	Don't throw out something that has been acked.
			 */

			if (skb1->acked)
			{
				break;
			}

			skb_unlink(skb1);
			kfree_skb(skb1, FREE_READ);
		}
		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
		sk->ack_backlog++;
		reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
	}
	else
	{
		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
	}

	/*
	 *	Now tell the user we may have some data.
	 */

	if (!sk->dead)
	{
		if(sk->debug)
			printk("Data wakeup.\n");
		sk->data_ready(sk,0);
	}
	return(0);
}
4078
4079 /*4080 * This routine is only called when we have urgent data4081 * signalled. Its the 'slow' part of tcp_urg. It could be4082 * moved inline now as tcp_urg is only called from one4083 * place. We handle URGent data wrong. We have to - as4084 * BSD still doesn't use the correction from RFC961.4085 */4086
4087 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4088 {4089 unsignedlongptr = ntohs(th->urg_ptr);
4090
4091 if (ptr)
4092 ptr--;
4093 ptr += th->seq;
4094
4095 /* ignore urgent data that we've already seen and read */4096 if (after(sk->copied_seq, ptr))
4097 return;
4098
4099 /* do we already have a newer (or duplicate) urgent pointer? */4100 if (sk->urg_data && !after(ptr, sk->urg_seq))
4101 return;
4102
4103 /* tell the world about our new urgent pointer */4104 if (sk->proc != 0) {4105 if (sk->proc > 0) {4106 kill_proc(sk->proc, SIGURG, 1);
4107 }else{4108 kill_pg(-sk->proc, SIGURG, 1);
4109 }4110 }4111 sk->urg_data = URG_NOTYET;
4112 sk->urg_seq = ptr;
4113 }4114
4115 /*4116 * This is the 'fast' part of urgent handling.4117 */4118
4119 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4120 unsignedlongsaddr, unsignedlonglen)
4121 {4122 unsignedlongptr;
4123
4124 /*4125 * Check if we get a new urgent pointer - normally not 4126 */4127
4128 if (th->urg)
4129 tcp_check_urg(sk,th);
4130
4131 /*4132 * Do we wait for any urgent data? - normally not4133 */4134
4135 if (sk->urg_data != URG_NOTYET)
4136 return 0;
4137
4138 /*4139 * Is the urgent pointer pointing into this packet? 4140 */4141
4142 ptr = sk->urg_seq - th->seq + th->doff*4;
4143 if (ptr >= len)
4144 return 0;
4145
4146 /*4147 * Ok, got the correct packet, update info 4148 */4149
4150 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4151 if (!sk->dead)
4152 sk->data_ready(sk,0);
4153 return 0;
4154 }4155
/*
 *	This will accept the next outstanding connection.
 *
 *	Returns the newly connected socket, or NULL with sk->err set
 *	(EINVAL: not listening, EAGAIN: nothing pending and O_NONBLOCK,
 *	ERESTARTSYS: interrupted by a signal).
 */

static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 * We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */

	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race. */
	cli();
	sk->inuse = 1;

	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Drop the lock before sleeping so the bottom half can
		   deliver the incoming connection that will wake us. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked)
		{
			/* Interrupted: we never re-took the lock, so no
			   release_sock here. */
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk.
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4214
/*
 *	This will initiate an outgoing connection: validate the address,
 *	pick an initial sequence number, build and transmit the SYN
 *	(with an MSS option), and move the socket to SYN_SENT.
 *
 *	Returns 0 on success or a negative errno.
 */

static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
	{
		return(-EISCONN);
	}

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */

	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 *	Don't want a TCP connection going to a broadcast address
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	release_sock(sk);

	/* GFP_KERNEL may sleep, hence the release_sock above. */
	buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->len = 24;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;

	t1 = (struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	rt=ip_rt_route(sk->daddr, NULL, NULL);


	/*
	 *	We need to build the routing stuff from the things saved in skb.
	 *	build_header also fills in 'dev' for us.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 +tmp);

	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	/* NOTE(review): ntohl used where htonl is conventional — they are
	   the same byte swap on every port, but confirm before "fixing". */
	t1->seq = ntohl(sk->write_seq++);
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;	/* 24 bytes: header plus 4 bytes of MSS option */
	/* use 512 or whatever user asked for */

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
		sk->mtu = rt->rt_mss;
	else
	{
		/* No explicit MSS: off-net destinations get the classic
		   576-byte conservative default, on-net get the maximum. */
#ifdef CONFIG_INET_SNARL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - HEADER_SIZE;
		else
			sk->mtu = MAX_WINDOW;
	}

	/*
	 *	but not bigger than device MTU
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	/* NOTE(review): dev is assumed non-NULL after build_header
	   succeeded — confirm build_header guarantees this. */
	sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	Put in the TCP options to say MTU.
	 *	(kind 2, length 4, MSS in network byte order)
	 */

	ptr = (unsigned char *)(t1+1);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 *	This must go first otherwise a really quick response will get reset.
	 */

	tcp_cache_zap();
	tcp_set_state(sk,TCP_SYN_SENT);
	if(rt&&rt->rt_flags&RTF_IRTT)
		sk->rto = rt->rt_irtt;
	else
		sk->rto = TCP_TIMEOUT_INIT;
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = TCP_SYN_RETRIES;

	sk->prot->queue_xmit(sk, dev, buff, 0);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4380
4381 /* This functions checks to see if the tcp header is actually acceptable. */4382 extern__inline__inttcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4383 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4384 {4385 unsignedlongnext_seq;
4386
4387 next_seq = len - 4*th->doff;
4388 if (th->fin)
4389 next_seq++;
4390 /* if we have a zero window, we can't have any data in the packet.. */4391 if (next_seq && !sk->window)
4392 gotoignore_it;
4393 next_seq += th->seq;
4394
4395 /*4396 * This isn't quite right. sk->acked_seq could be more recent4397 * than sk->window. This is however close enough. We will accept4398 * slightly more packets than we should, but it should not cause4399 * problems unless someone is trying to forge packets.4400 */4401
4402 /* have we already seen all of this packet? */4403 if (!after(next_seq+1, sk->acked_seq))
4404 gotoignore_it;
4405 /* or does it start beyond the window? */4406 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4407 gotoignore_it;
4408
4409 /* ok, at least part of this packet would seem interesting.. */4410 return 1;
4411
4412 ignore_it:
4413 if (th->rst)
4414 return 0;
4415
4416 /*4417 * Send a reset if we get something not ours and we are4418 * unsynchronized. Note: We don't do anything to our end. We4419 * are just killing the bogus remote connection then we will4420 * connect again and it will work (with luck).4421 */4422
4423 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4424 {4425 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4426 return 1;
4427 }4428
4429 /* Try to resync things. */4430 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4431 return 0;
4432 }4433
/*
 *	When we get a reset we do this: record the error the user will
 *	see, tear the connection down (subject to RFC1337 protection),
 *	wake the owner and dispose of the frame.  Always returns 0.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Map the state at reset time onto the user-visible error. */
	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
/*
 *	A TCP packet has arrived.
 *
 *	Demultiplex it to a socket (via a one-entry cache), validate it,
 *	and run it through the RFC793/RFC1122 segment processing rules.
 *	'redo' is set when the frame is being replayed from a socket's
 *	backlog queue (checksum and byte-swapping already done).
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	unsigned long daddr, unsigned short len,
	unsigned long saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	tcp_statistics.TcpInSegs++;

	/* Ignore frames not addressed to this host. */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket, using the last hit cache if applicable.
	 */

	if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
		sk=(struct sock *)th_cache_sk;
	else
	{
		sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
		th_cache_saddr=saddr;
		th_cache_daddr=daddr;
		th_cache_dport=th->dest;
		th_cache_sport=th->source;
		th_cache_sk=sk;
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo)
	{
		/* First pass: verify the checksum before anything else. */
		if (tcp_check(th, len, saddr, daddr ))
		{
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}
		th->seq = ntohl(th->seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			/*
			 *	Discard frame
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->len = len;
		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here. */
		cli();
		if (sk->inuse)
		{
			/* Socket busy: queue for later replay (redo=1). */
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Backlog replay: socket vanished meanwhile -> reset. */
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 */

	if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

	skb->sk=sk;
	sk->rmem_alloc += skb->mem_len;

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/* Reset the ack - its an ack from a
					   different connection  [ th->rst is checked in tcp_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq=th->seq+1;
				sk->fin_seq=th->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if(sk->max_window==0)
				{
					sk->max_window = 32;	/* Sanity check */
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
					   sk->dummy_th.source==th->source &&
					   sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
	 *	a more complex suggestion for fixing these reuse issues in RFC1644
	 *	but not yet ready for general use. Also see RFC1379.
	 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(th->seq, sk->acked_seq) && !th->rst)
		{
			long seq=sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			/* Kill the old TIME_WAIT socket and hand the SYN to
			   any listener on the same port. */
			sk->rmem_alloc -= skb->mem_len;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->mem_len;
				/* New ISN well above the old stream's numbers. */
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */

	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}

	/*
	 *	Process the ACK
	 */


	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	Process urgent data
	 */

	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}


	/*
	 *	Process the encapsulated data
	 */

	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */

	release_sock(sk);
	return 0;
}
4859 /*4860 * This routine sends a packet with an out of date sequence4861 * number. It assumes the other end will try to ack it.4862 */4863
4864 staticvoidtcp_write_wakeup(structsock *sk)
/* */4865 {4866 structsk_buff *buff,*skb;
4867 structtcphdr *t1;
4868 structdevice *dev=NULL;
4869 inttmp;
4870
4871 if (sk->zapped)
4872 return; /* After a valid reset we can send no more */4873
4874 /*4875 * Write data can still be transmitted/retransmitted in the4876 * following states. If any other state is encountered, return.4877 * [listen/close will never occur here anyway]4878 */4879
4880 if (sk->state != TCP_ESTABLISHED &&
4881 sk->state != TCP_CLOSE_WAIT &&
4882 sk->state != TCP_FIN_WAIT1 &&
4883 sk->state != TCP_LAST_ACK &&
4884 sk->state != TCP_CLOSING4885 )
4886 {4887 return;
4888 }4889
4890 if (before(sk->sent_seq, sk->window_seq) &&
4891 (skb=skb_peek(&sk->write_queue)))
4892 {4893 /*4894 * We are probing the opening of a window4895 * but the window size is != 04896 * must have been a result SWS advoidance ( sender )4897 */4898
4899 structiphdr *iph;
4900 structtcphdr *th;
4901 structtcphdr *nth;
4902 unsignedlongwin_size, ow_size;
4903 void * tcp_data_start;
4904
4905 win_size = sk->window_seq - sk->sent_seq;
4906
4907 iph = (structiphdr *)(skb->data + skb->dev->hard_header_len);
4908 th = (structtcphdr *)(((char *)iph) +(iph->ihl << 2));
4909
4910 buff = sk->prot->wmalloc(sk, win_size + th->doff * 4 +
4911 (iph->ihl << 2) +
4912 skb->dev->hard_header_len,
4913 1, GFP_ATOMIC);
4914 if ( buff == NULL )
4915 return;
4916
4917 buff->len = 0;
4918
4919 /* 4920 * If we strip the packet on the write queue we must4921 * be ready to retransmit this one 4922 */4923
4924 buff->free = 0;
4925
4926 buff->sk = sk;
4927 buff->localroute = sk->localroute;
4928
4929 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4930 IPPROTO_TCP, sk->opt, buff->mem_len,
4931 sk->ip_tos,sk->ip_ttl);
4932 if (tmp < 0)
4933 {4934 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4935 return;
4936 }4937
4938 buff->len += tmp;
4939 buff->dev = dev;
4940
4941 nth = (structtcphdr *) (buff->data + buff->len);
4942 buff->len += th->doff * 4;
4943
4944 memcpy(nth, th, th->doff * 4);
4945
4946 nth->ack = 1;
4947 nth->ack_seq = ntohl(sk->acked_seq);
4948 nth->window = ntohs(tcp_select_window(sk));
4949 nth->check = 0;
4950
4951 tcp_data_start = skb->data + skb->dev->hard_header_len +
4952 (iph->ihl << 2) + th->doff * 4;
4953
4954 memcpy(buff->data + buff->len, tcp_data_start, win_size);
4955 buff->len += win_size;
4956 buff->h.seq = sk->sent_seq + win_size;
4957
4958 /*4959 * now: shrink the queue head segment 4960 */4961
4962 th->check = 0;
4963 ow_size = skb->len - win_size -
4964 ((unsignedlong) (tcp_data_start - (void *) skb->data));
4965
4966 memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
4967 skb->len -= win_size;
4968 sk->sent_seq += win_size;
4969 th->seq = htonl(sk->sent_seq);
4970
4971 if (th->urg)
4972 {4973 unsignedshorturg_ptr;
4974
4975 urg_ptr = ntohs(th->urg_ptr);
4976 if (urg_ptr <= win_size)
4977 th->urg = 0;
4978 else4979 {4980 urg_ptr -= win_size;
4981 th->urg_ptr = htons(urg_ptr);
4982 nth->urg_ptr = htons(win_size);
4983 }4984 }4985
4986 tcp_send_check(nth, sk->saddr, sk->daddr,
4987 nth->doff * 4 + win_size , sk);
4988 }4989 else4990 {4991 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
4992 if (buff == NULL)
4993 return;
4994
4995 buff->len = sizeof(structtcphdr);
4996 buff->free = 1;
4997 buff->sk = sk;
4998 buff->localroute = sk->localroute;
4999
5000 t1 = (structtcphdr *) buff->data;
5001
5002 /*5003 * Put in the IP header and routing stuff. 5004 */5005
5006 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5007 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
5008 if (tmp < 0)
5009 {5010 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
5011 return;
5012 }5013
5014 buff->len += tmp;
5015 t1 = (structtcphdr *)((char *)t1 +tmp);
5016
5017 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
5018
5019 /*5020 * Use a previous sequence.5021 * This should cause the other end to send an ack.5022 */5023
5024 t1->seq = htonl(sk->sent_seq-1);
5025 t1->ack = 1;
5026 t1->res1= 0;
5027 t1->res2= 0;
5028 t1->rst = 0;
5029 t1->urg = 0;
5030 t1->psh = 0;
5031 t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */5032 t1->syn = 0;
5033 t1->ack_seq = ntohl(sk->acked_seq);
5034 t1->window = ntohs(tcp_select_window(sk));
5035 t1->doff = sizeof(*t1)/4;
5036 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
5037
5038 }5039
5040 /*5041 * Send it.5042 */5043
5044 sk->prot->queue_xmit(sk, dev, buff, 1);
5045 tcp_statistics.TcpOutSegs++;
5046 }5047
5048 /*5049 * A window probe timeout has occurred.5050 */5051
5052 voidtcp_send_probe0(structsock *sk)
/* */5053 {5054 if (sk->zapped)
5055 return; /* After a valid reset we can send no more */5056
5057 tcp_write_wakeup(sk);
5058
5059 sk->backoff++;
5060 sk->rto = min(sk->rto << 1, 120*HZ);
5061 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5062 sk->retransmits++;
5063 sk->prot->retransmits ++;
5064 }5065
5066 /*5067 * Socket option code for TCP. 5068 */5069
5070 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5071 {5072 intval,err;
5073
5074 if(level!=SOL_TCP)
5075 returnip_setsockopt(sk,level,optname,optval,optlen);
5076
5077 if (optval == NULL)
5078 return(-EINVAL);
5079
5080 err=verify_area(VERIFY_READ, optval, sizeof(int));
5081 if(err)
5082 returnerr;
5083
5084 val = get_fs_long((unsignedlong *)optval);
5085
5086 switch(optname)
5087 {5088 caseTCP_MAXSEG:
5089 /*5090 * values greater than interface MTU won't take effect. however at5091 * the point when this call is done we typically don't yet know5092 * which interface is going to be used5093 */5094 if(val<1||val>MAX_WINDOW)
5095 return -EINVAL;
5096 sk->user_mss=val;
5097 return 0;
5098 caseTCP_NODELAY:
5099 sk->nonagle=(val==0)?0:1;
5100 return 0;
5101 default:
5102 return(-ENOPROTOOPT);
5103 }5104 }5105
5106 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5107 {5108 intval,err;
5109
5110 if(level!=SOL_TCP)
5111 returnip_getsockopt(sk,level,optname,optval,optlen);
5112
5113 switch(optname)
5114 {5115 caseTCP_MAXSEG:
5116 val=sk->user_mss;
5117 break;
5118 caseTCP_NODELAY:
5119 val=sk->nonagle;
5120 break;
5121 default:
5122 return(-ENOPROTOOPT);
5123 }5124 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5125 if(err)
5126 returnerr;
5127 put_fs_long(sizeof(int),(unsignedlong *) optlen);
5128
5129 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5130 if(err)
5131 returnerr;
5132 put_fs_long(val,(unsignedlong *)optval);
5133
5134 return(0);
5135 }5136
5137
5138 structprototcp_prot = {5139 sock_wmalloc,
5140 sock_rmalloc,
5141 sock_wfree,
5142 sock_rfree,
5143 sock_rspace,
5144 sock_wspace,
5145 tcp_close,
5146 tcp_read,
5147 tcp_write,
5148 tcp_sendto,
5149 tcp_recvfrom,
5150 ip_build_header,
5151 tcp_connect,
5152 tcp_accept,
5153 ip_queue_xmit,
5154 tcp_retransmit,
5155 tcp_write_wakeup,
5156 tcp_read_wakeup,
5157 tcp_rcv,
5158 tcp_select,
5159 tcp_ioctl,
5160 NULL,
5161 tcp_shutdown,
5162 tcp_setsockopt,
5163 tcp_getsockopt,
5164 128,
5165 0,
5166 "TCP",
5167 0, 0,
5168 {NULL,}5169 };