1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@no.unit.nvg> 20 * 21 * Fixes: 22 * Alan Cox : Numerous verify_area() calls 23 * Alan Cox : Set the ACK bit on a reset 24 * Alan Cox : Stopped it crashing if it closed while sk->inuse=1 25 * and was trying to connect (tcp_err()). 26 * Alan Cox : All icmp error handling was broken 27 * pointers passed where wrong and the 28 * socket was looked up backwards. Nobody 29 * tested any icmp error code obviously. 30 * Alan Cox : tcp_err() now handled properly. It wakes people 31 * on errors. select behaves and the icmp error race 32 * has gone by moving it into sock.c 33 * Alan Cox : tcp_reset() fixed to work for everything not just 34 * packets for unknown sockets. 35 * Alan Cox : tcp option processing. 36 * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong] 37 * Herp Rosmanith : More reset fixes 38 * Alan Cox : No longer acks invalid rst frames. Acking 39 * any kind of RST is right out. 40 * Alan Cox : Sets an ignore me flag on an rst receive 41 * otherwise odd bits of prattle escape still 42 * Alan Cox : Fixed another acking RST frame bug. Should stop 43 * LAN workplace lockups. 
44 * Alan Cox : Some tidyups using the new skb list facilities 45 * Alan Cox : sk->keepopen now seems to work 46 * Alan Cox : Pulls options out correctly on accepts 47 * Alan Cox : Fixed assorted sk->rqueue->next errors 48 * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops. 49 * Alan Cox : Tidied tcp_data to avoid a potential nasty. 50 * Alan Cox : Added some better commenting, as the tcp is hard to follow 51 * Alan Cox : Removed incorrect check for 20 * psh 52 * Michael O'Reilly : ack < copied bug fix. 53 * Johannes Stille : Misc tcp fixes (not all in yet). 54 * Alan Cox : FIN with no memory -> CRASH 55 * Alan Cox : Added socket option proto entries. Also added awareness of them to accept. 56 * Alan Cox : Added TCP options (SOL_TCP) 57 * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets. 58 * Alan Cox : Use ip_tos/ip_ttl settings. 59 * Alan Cox : Handle FIN (more) properly (we hope). 60 * Alan Cox : RST frames sent on unsynchronised state ack error/ 61 * Alan Cox : Put in missing check for SYN bit. 62 * Alan Cox : Added tcp_select_window() aka NET2E 63 * window non shrink trick. 64 * Alan Cox : Added a couple of small NET2E timer fixes 65 * Charles Hedrick : TCP fixes 66 * Toomas Tamm : TCP window fixes 67 * Alan Cox : Small URG fix to rlogin ^C ack fight 68 * Charles Hedrick : Rewrote most of it to actually work 69 * Linus : Rewrote tcp_read() and URG handling 70 * completely 71 * Gerhard Koerting: Fixed some missing timer handling 72 * Matthew Dillon : Reworked TCP machine states as per RFC 73 * Gerhard Koerting: PC/TCP workarounds 74 * Adam Caldwell : Assorted timer/timing errors 75 * Matthew Dillon : Fixed another RST bug 76 * Alan Cox : Move to kernel side addressing changes. 77 * Alan Cox : Beginning work on TCP fastpathing (not yet usable) 78 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
79 * Alan Cox : TCP fast path debugging 80 * Alan Cox : Window clamping 81 * Michael Riepe : Bug in tcp_check() 82 * Matt Dillon : More TCP improvements and RST bug fixes 83 * Matt Dillon : Yet more small nasties remove from the TCP code 84 * (Be very nice to this man if tcp finally works 100%) 8) 85 * Alan Cox : BSD accept semantics. 86 * Alan Cox : Reset on closedown bug. 87 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 88 * Michael Pall : Handle select() after URG properly in all cases. 89 * Michael Pall : Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin). 90 * Michael Pall : Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now. 91 * Michael Pall : recv(...,MSG_OOB) never blocks in the BSD api. 92 * Alan Cox : Changed the semantics of sk->socket to 93 * fix a race and a signal problem with 94 * accept() and async I/O. 95 * Alan Cox : Relaxed the rules on tcp_sendto(). 96 * Yury Shevchuk : Really fixed accept() blocking problem. 97 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 98 * clients/servers which listen in on 99 * fixed ports. 100 * Alan Cox : Cleaned the above up and shrank it to 101 * a sensible code size. 102 * Alan Cox : Self connect lockup fix. 103 * Alan Cox : No connect to multicast. 104 * Ross Biro : Close unaccepted children on master 105 * socket close. 106 * Alan Cox : Reset tracing code. 107 * Alan Cox : Spurious resets on shutdown. 108 * Alan Cox : Giant 15 minute/60 second timer error 109 * Alan Cox : Small whoops in selecting before an accept. 110 * Alan Cox : Kept the state trace facility since it's 111 * handy for debugging. 112 * Alan Cox : More reset handler fixes. 113 * Alan Cox : Started rewriting the code based on the RFC's 114 * for other useful protocol references see: 115 * Comer, KA9Q NOS, and for a reference on the 116 * difference between specifications and how BSD 117 * works see the 4.4lite source. 
 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
 *					close.
 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
 *		Alan Cox	:	Reimplemented timers as per the RFC and using multiple
 *					timers for sanity.
 *		Alan Cox	:	Small bug fixes, and a lot of new
 *					comments.
 *		Alan Cox	:	Fixed dual reader crash by locking
 *					the buffers (much like datagram.c)
 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
 *					now gets fed up of retrying without
 *					(even a no space) answer.
 *		Alan Cox	:	Extracted closing code better
 *		Alan Cox	:	Fixed the closing state machine to
 *					resemble the RFC.
 *		Alan Cox	:	More 'per spec' fixes.
 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
 *					only frames. At least one pc tcp stack
 *					generates them.
 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
 *					ack if state is TCP_CLOSED.
 *
 *
 * To Fix:
 *		Fast path the code. Two things here - fix the window calculation
 *		so it doesn't iterate over the queue, also spot packets with no funny
 *		options arriving in order and process directly.
 *
 *		Implement RFC 1191 [Path MTU discovery]
 *		Look at the effect of implementing RFC 1337 suggestions and their impact.
 *		Rewrite output state machine to use a single queue and do low window
 *		situations as per the spec (RFC 1122)
 *		Speed up input assembly algorithm.
 *		RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 *		could do with it working on IPv4
 *		User settable/learned rtt/max window/mtu
 *		Cope with MTU/device switches when retransmitting in tcp.
 *		Fix the window handling to use PR's new code.
 *
 *		Change the fundamental structure to a single send queue maintained
 *		by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 *		active routes too]). Cut the queue off in tcp_retransmit/
 *		tcp_transmit.
162 * Change the receive queue to assemble as it goes. This lets us 163 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 164 * tcp_data/tcp_read as well as the window shrink crud. 165 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 166 * tcp_queue_skb seem obvious routines to extract. 167 * 168 * This program is free software; you can redistribute it and/or 169 * modify it under the terms of the GNU General Public License 170 * as published by the Free Software Foundation; either version 171 * 2 of the License, or(at your option) any later version. 172 * 173 * Description of States: 174 * 175 * TCP_SYN_SENT sent a connection request, waiting for ack 176 * 177 * TCP_SYN_RECV received a connection request, sent ack, 178 * waiting for final ack in three-way handshake. 179 * 180 * TCP_ESTABLISHED connection established 181 * 182 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 183 * transmission of remaining buffered data 184 * 185 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 186 * to shutdown 187 * 188 * TCP_CLOSING both sides have shutdown but we still have 189 * data we have to finish sending 190 * 191 * TCP_TIME_WAIT timeout to catch resent junk before entering 192 * closed, can only be entered from FIN_WAIT2 193 * or CLOSING. Required because the other end 194 * may not have gotten our last ACK causing it 195 * to retransmit the data packet (which we ignore) 196 * 197 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 198 * us to finish writing our data and to shutdown 199 * (we have to close() to move on to LAST_ACK) 200 * 201 * TCP_LAST_ACK out side has shutdown after remote has 202 * shutdown. There may still be data in our 203 * buffer that we have to finish sending 204 * 205 * TCP_CLOSE socket is finished 206 */ 207
208 #include <linux/types.h>
209 #include <linux/sched.h>
210 #include <linux/mm.h>
211 #include <linux/time.h>
212 #include <linux/string.h>
213 #include <linux/config.h>
214 #include <linux/socket.h>
215 #include <linux/sockios.h>
216 #include <linux/termios.h>
217 #include <linux/in.h>
218 #include <linux/fcntl.h>
219 #include <linux/inet.h>
220 #include <linux/netdevice.h>
221 #include "snmp.h"
222 #include "ip.h"
223 #include "protocol.h"
224 #include "icmp.h"
225 #include "tcp.h"
226 #include "arp.h"
227 #include <linux/skbuff.h>
228 #include "sock.h"
229 #include "route.h"
230 #include <linux/errno.h>
231 #include <linux/timer.h>
232 #include <asm/system.h>
233 #include <asm/segment.h>
234 #include <linux/mm.h>
235
/*
 *	The MSL timer is the 'normal' timer.
 */

#define reset_msl_timer(x,y,z)	reset_timer(x,y,z)

/* NOTE(review): SEQ_TICK and seq_offset are not referenced in this part of
   the file — presumably used for initial sequence number generation;
   confirm against the rest of tcp.c. */
#define SEQ_TICK 3
unsigned long seq_offset;

/* TCP SNMP (MIB) counter block; e.g. TcpCurrEstab is maintained by
   tcp_set_state() below. */
struct tcp_mib tcp_statistics;

/* Forward declaration: tcp_close_pending() calls tcp_close() before its
   definition appears. */
static void tcp_close(struct sock *sk, int timeout);


/*
 *	The less said about this the better, but it works and will do for 1.2
 *
 *	A single shared wait queue: selects on listening sockets sleep on it
 *	(tcp_listen_select) and tcp_set_state() wakes it whenever a pending
 *	connection reaches ESTABLISHED.
 */

static struct wait_queue *master_select_wakeup;
/*
 *	Return the smaller of two unsigned integers.
 */
static __inline__ int min(unsigned int a, unsigned int b)
{
	return (a < b) ? a : b;
}
/* State-transition tracing is compiled out by default. */
#undef STATE_TRACE

#ifdef STATE_TRACE
/* Human-readable names for the TCP_* state constants, indexed by state
   number; used only by the debug printk in tcp_set_state(). */
static char *statename[]={
	"Unused","Established","Syn Sent","Syn Recv",
	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
	"Close Wait","Last ACK","Listen","Closing"
};
#endif
/*
 *	Move a socket's TCP state machine to a new state, keeping the
 *	count of currently established connections and the pending-accept
 *	wakeup in step with the change.
 */
static __inline__ void tcp_set_state(struct sock *sk, int state)
{
	if (sk->state == TCP_ESTABLISHED)	/* leaving ESTABLISHED */
		tcp_statistics.TcpCurrEstab--;
#ifdef STATE_TRACE
	if (sk->debug)
		printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
#endif
	/* This is a hack but it doesn't occur often and it's going to
	   be a real pain to fix nicely */

	/* SYN_RECV -> ESTABLISHED: a listening socket now has a child ready
	   to accept(); wake anyone selecting on the listener. */
	if (state == TCP_ESTABLISHED && sk->state == TCP_SYN_RECV)
	{
		wake_up_interruptible(&master_select_wakeup);
	}
	sk->state = state;
	if (state == TCP_ESTABLISHED)		/* entering ESTABLISHED */
		tcp_statistics.TcpCurrEstab++;
}
292 /* 293 * This routine picks a TCP windows for a socket based on 294 * the following constraints 295 * 296 * 1. The window can never be shrunk once it is offered (RFC 793) 297 * 2. We limit memory per socket 298 * 299 * For now we use NET2E3's heuristic of offering half the memory 300 * we have handy. All is not as bad as this seems however because 301 * of two things. Firstly we will bin packets even within the window 302 * in order to get the data we are waiting for into the memory limit. 303 * Secondly we bin common duplicate forms at receive time 304 * Better heuristics welcome 305 */ 306
307 inttcp_select_window(structsock *sk)
/* */ 308 { 309 intnew_window = sk->prot->rspace(sk);
310
311 if(sk->window_clamp)
312 new_window=min(sk->window_clamp,new_window);
313 /* 314 * Two things are going on here. First, we don't ever offer a 315 * window less than min(sk->mss, MAX_WINDOW/2). This is the 316 * receiver side of SWS as specified in RFC1122. 317 * Second, we always give them at least the window they 318 * had before, in order to avoid retracting window. This 319 * is technically allowed, but RFC1122 advises against it and 320 * in practice it causes trouble. 321 * 322 * Fixme: This doesn't correctly handle the case where 323 * new_window > sk->window but not by enough to allow for the 324 * shift in sequence space. 325 */ 326 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
327 return(sk->window);
328 return(new_window);
329 } 330
331 /* 332 * Find someone to 'accept'. Must be called with 333 * sk->inuse=1 or cli() 334 */ 335
336 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 337 { 338 structsk_buff *p=skb_peek(&s->receive_queue);
339 if(p==NULL)
340 returnNULL;
341 do 342 { 343 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
344 returnp;
345 p=p->next;
346 } 347 while(p!=(structsk_buff *)&s->receive_queue);
348 returnNULL;
349 } 350
/*
 *	Remove a completed connection and return it. This is used by
 *	tcp_accept() to get connections from the queue.
 *
 *	Interrupts are disabled around the find/unlink pair so the receive
 *	queue cannot change between the two operations.
 */
static struct sk_buff *tcp_dequeue_established(struct sock *s)
{
	struct sk_buff *skb;
	unsigned long flags;

	save_flags(flags);
	cli();
	skb = tcp_find_established(s);
	if (skb != NULL)
		skb_unlink(skb);	/* Take it off the queue */
	restore_flags(flags);
	return skb;
}
369 /* 370 * This routine closes sockets which have been at least partially 371 * opened, but not yet accepted. Currently it is only called by 372 * tcp_close, and timeout mirrors the value there. 373 */ 374
375 staticvoidtcp_close_pending (structsock *sk)
/* */ 376 { 377 structsk_buff *skb;
378
379 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
380 { 381 skb->sk->dead=1;
382 tcp_close(skb->sk, 0);
383 kfree_skb(skb, FREE_READ);
384 } 385 return;
386 } 387
/*
 *	Enter the time wait state.
 *
 *	Marks the socket fully shut down in both directions, notifies any
 *	sleeper of the state change, and arms the MSL timer that will
 *	eventually take the socket to CLOSED.
 */
static void tcp_time_wait(struct sock *sk)
{
	tcp_set_state(sk, TCP_TIME_WAIT);
	sk->shutdown = SHUTDOWN_MASK;		/* no more data either way */
	if (!sk->dead)
		sk->state_change(sk);		/* wake anyone waiting on this socket */
	reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}
/*
 *	A socket has timed out on its send queue and wants to do a
 *	little retransmitting. Currently this means TCP.
 *
 *	Walks the send queue from send_head, refreshing and re-sending each
 *	pending frame. If 'all' is zero only the first frame is resent;
 *	otherwise we stop after cong_window frames. No backoff is done
 *	here — see tcp_retransmit_time() for that.
 */
void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff *skb;
	struct proto *prot;
	struct device *dev;
	int ct = 0;			/* frames (re)sent this call */

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;	/* restamp for RTT/timeout accounting */

		/*
		 * In general it's OK just to use the old packet. However we
		 * need to use the current ack and window fields. Urg and
		 * urg_ptr could possibly stand to be updated as well, but we
		 * don't keep the necessary data. That shouldn't be a problem,
		 * if the other end is doing the right thing. Since we're
		 * changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)(skb->data + dev->hard_header_len);
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = skb->len - (((unsigned char *) th) - skb->data);

		/*
		 * Note: We ought to check for window limits here but
		 * currently this is done (less efficiently) elsewhere.
		 * We do need to check for a route change but can't handle
		 * that until we have the new 1.3.x buffers in.
		 */

		iph->id = htons(ip_id_count++);	/* fresh IP id for the rebuilt datagram */
		ip_send_check(iph);		/* redo the IP header checksum */

		/*
		 * This is not the right way to handle this. We have to
		 * issue an up to date window and ack report with this
		 * retransmit to keep the odd buggy tcp that relies on
		 * the fact BSD does this happy.
		 * We don't however need to recalculate the entire
		 * checksum, so someone wanting a small problem to play
		 * with might like to implement RFC1141/RFC1624 and speed
		 * this up by avoiding a full checksum.
		 */

		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));
		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		/*
		 * If the interface is (still) up and running, kick it.
		 */

		if (dev->flags & IFF_UP)
		{
			/*
			 * If the packet is still being sent by the device/protocol
			 * below then don't retransmit. This is both needed, and good -
			 * especially with connected mode AX.25 where it stops resends
			 * occurring of an as yet unsent anyway frame!
			 * We still add up the counts as the round trip time wants
			 * adjusting.
			 */
			if (sk && !skb_device_locked(skb))
			{
				/* Remove it from any existing driver queue first! */
				skb_unlink(skb);
				/* Now queue it */
				ip_statistics.IpOutRequests++;
				dev_queue_xmit(skb, dev, sk->priority);
			}
		}

		/*
		 * Count retransmissions
		 */

		ct++;
		sk->prot->retransmits++;

		/*
		 * Only one retransmit requested.
		 */

		if (!all)
			break;

		/*
		 * This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;	/* next frame on the send queue */
	}
}
/*
 *	Reset the retransmission timer: record why it is running
 *	(TIME_WRITE, TIME_PROBE0, TIME_KEEPOPEN, ...) and re-arm it
 *	'when' ticks from now.
 */
static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
{
	del_timer(&sk->retransmit_timer);
	sk->ip_xmit_timeout = why;	/* remembered for retransmit_timer() */
	if ((int)when < 0)
	{
		/* A negative delay is a caller bug; limp along with a tiny one. */
		when = 3;
		printk("Error: Negative timer in xmit_timer\n");
	}
	sk->retransmit_timer.expires = when;
	add_timer(&sk->retransmit_timer);
}
/*
 *	This is the normal code called for timeouts. It does the retransmission
 *	and then does backoff. tcp_do_retransmit is separated out because
 *	tcp_ack needs to send stuff from the retransmit queue without
 *	initiating a backoff.
 */

void tcp_retransmit_time(struct sock *sk, int all)
{
	tcp_do_retransmit(sk, all);

	/*
	 * Increase the timeout each time we retransmit. Note that
	 * we do not increase the rtt estimate. rto is initialized
	 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic. netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT. I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */

	sk->retransmits++;
	sk->backoff++;
	sk->rto = min(sk->rto << 1, 120*HZ);	/* exponential backoff, clamped at 120s */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
}
565
566 /* 567 * A timer event has trigger a tcp retransmit timeout. The 568 * socket xmit queue is ready and set up to send. Because 569 * the ack receive code keeps the queue straight we do 570 * nothing clever here. 571 */ 572
573 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 574 { 575 if (all)
576 { 577 tcp_retransmit_time(sk, all);
578 return;
579 } 580
581 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 582 /* sk->ssthresh in theory can be zero. I guess that's OK */ 583 sk->cong_count = 0;
584
585 sk->cong_window = 1;
586
587 /* Do the actual retransmit. */ 588 tcp_retransmit_time(sk, all);
589 } 590
/*
 *	A write timeout has occurred. Process the after effects.
 *
 *	Returns 0 when the socket has been moved to TCP_CLOSE (the caller
 *	should stop using it), 1 when the connection is still alive.
 */
static int tcp_write_timeout(struct sock *sk)
{
	/*
	 *	Look for a 'soft' timeout: every 8th retransmit while
	 *	established, or past TCP_RETR1 in any other state.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
	{
		/*
		 *	Attempt to recover if arp has changed (unlikely!) or
		 *	a route has shifted (not supported prior to 1.3).
		 */
		arp_destroy (sk->daddr, 0);
		ip_route_check (sk->daddr);
	}
	/*
	 *	Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2)
	{
		sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 *	Time wait the socket
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 *	Clean up time.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			return 0;
		}
	}
	return 1;
}
/*
 *	The TCP retransmit timer. This lacks a few small details.
 *
 *	1. An initial rtt timeout on the probe0 should cause what we can
 *	   of the first write queue buffer to be split and sent.
 *	2. On a 'major timeout' as defined by RFC1122 we shouldn't report
 *	   ETIMEDOUT if we know an additional 'soft' error caused this.
 *	   tcp_err should save a 'soft error' for us.
 *
 *	'data' is the struct sock this timer belongs to; the reason for
 *	the timeout was stashed in sk->ip_xmit_timeout by reset_xmit_timer().
 */
static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;

	/*
	 *	only process if socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;		/* we own the socket from here on */
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				/* Nothing to retransmit; fall through to the
				   break at the end of this block. */
				restore_flags(flags);
			}
			else
			{
				/*
				 * Kicked by a delayed ack. Reset timer
				 * correctly now
				 */
				if (jiffies < skb->when + sk->rto)
				{
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission
				 */
				sk->prot->retransmit (sk, 0);
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
/*
 *	This routine is called by the ICMP module when it gets some
 *	sort of error condition. If err < 0 then the socket should
 *	be closed and the error returned to the user. If err > 0
 *	it's just the icmp type << 8 | icmp code. After adjustment
 *	header points to the first 8 bytes of the tcp header. We need
 *	to find the appropriate port.
 */

void tcp_err(int err, unsigned char *header, unsigned long daddr,
	unsigned long saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	struct iphdr *iph = (struct iphdr *)header;

	header += 4*iph->ihl;		/* step over the IP header to the TCP header */

	th = (struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)
		return;			/* not one of our connections */

	if (err < 0)
	{
		/* Fatal by contract: report straight to the user. */
		sk->err = -err;
		sk->error_report(sk);
		return;
	}

	if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8))
	{
		/*
		 * FIXME:
		 * For now we will just trigger a linear backoff.
		 * The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	/* sk->err = icmp_err_convert[err & 0xff].errno; -- moved as TCP should hide non fatals internally (and does) */

	/*
	 * If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 */

	if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT)
	{
		if (sk->state == TCP_SYN_SENT)
		{
			/* Connection attempt killed by the ICMP error. */
			tcp_statistics.TcpAttemptFails++;
			tcp_set_state(sk,TCP_CLOSE);
			sk->error_report(sk);	/* Wake people up to see the error (see connect in sock.c) */
		}
		sk->err = icmp_err_convert[err & 0xff].errno;
	}
	return;
}
810
/*
 *	Walk down the receive queue counting readable data until we hit the end or we find a gap
 *	in the received data queue (ie a frame missing that needs sending to us). Not
 *	sorting using two queues as data arrives makes life so much harder.
 *
 *	Returns the number of bytes a read() could consume right now
 *	(0 for a NULL socket or an empty queue).
 */

static int tcp_readable(struct sock *sk)
{
	unsigned long counted;		/* sequence number we have counted up to */
	unsigned long amount;		/* readable bytes found so far */
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	save_flags(flags);
	cli();				/* queue must not change under us */
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 *	Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->h.th->seq))	/* Found a hole so stops here */
			break;
		sum = skb->len - (counted - skb->h.th->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;		/* SYN occupies a sequence number but no data byte */
		if (sum > 0)
		{	/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;	/* ...but the SYN itself is not readable */
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop. This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue. Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read(). And the queue scan in tcp_read()
		 * was correct. Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;	/* PSH with data: stop counting here */
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
888 /* 889 * LISTEN is a special case for select.. 890 */ 891 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */ 892 { 893 if (sel_type == SEL_IN) { 894 intretval;
895
896 sk->inuse = 1;
897 retval = (tcp_find_established(sk) != NULL);
898 release_sock(sk);
899 if (!retval)
900 select_wait(&master_select_wakeup,wait);
901 returnretval;
902 } 903 return 0;
904 } 905
906
/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to set "sk->inuse", as the upper select layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 *
 *	Returns 1 when the requested condition already holds; otherwise
 *	registers on the socket's wait queue and returns 0.
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type) {
	case SEL_IN:
		if (sk->err)
			return 1;	/* pending error is readable */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;		/* handshake not done: nothing to read yet */

		if (sk->shutdown & RCV_SHUTDOWN)
			return 1;	/* EOF is readable */

		if (sk->acked_seq == sk->copied_seq)
			break;		/* no new acked data */

		/* Data is available — readable unless the only unread byte
		   is a non-inline urgent byte (which read() must skip). */
		if (sk->urg_seq != sk->copied_seq ||
		    sk->acked_seq != sk->copied_seq+1 ||
		    sk->urginline || !sk->urg_data)
			return 1;
		break;

	case SEL_OUT:
		if (sk->err)
			return 1;
		if (sk->shutdown & SEND_SHUTDOWN)
			return 0;	/* can never write again */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;
		/*
		 * This is now right thanks to a small fix
		 * by Matt Dillon.
		 */

		if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
			break;		/* not enough room for a full segment */
		return 1;

	case SEL_EX:
		if (sk->urg_data)
			return 1;	/* urgent data is the exceptional condition */
		break;
	}
	select_wait(sk->sleep, wait);
	return 0;
}
/*
 *	Socket-level ioctl handler for TCP.
 *
 *	TIOCINQ   — bytes readable right now (tcp_readable()).
 *	SIOCATMARK — nonzero when the read pointer sits at the urgent mark.
 *	TIOCOUTQ  — free space in the send buffer.
 *	Results are copied out to the user pointer 'arg'; returns 0 or a
 *	negative errno.
 */
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	int err;
	switch(cmd)
	{
		case TIOCINQ:
#ifdef FIXME	/* FIXME: */
		case FIONREAD:
#endif
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);	/* no data on a listener */

			sk->inuse = 1;
			amount = tcp_readable(sk);
			release_sock(sk);
			err = verify_area(VERIFY_WRITE,(void *)arg,
				sizeof(unsigned long));
			if (err)
				return err;
			put_fs_long(amount,(unsigned long *)arg);
			return(0);
		}
		case SIOCATMARK:
		{
			/* At the mark: urgent data exists and we have read
			   exactly up to its sequence position. */
			int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;

			err = verify_area(VERIFY_WRITE,(void *) arg,
				sizeof(unsigned long));
			if (err)
				return err;
			/* NOTE(review): area is verified for sizeof(unsigned long)
			   but the store goes through an (int *) cast — looks
			   inconsistent; confirm against SIOCATMARK userland users. */
			put_fs_long(answ,(int *) arg);
			return(0);
		}
		case TIOCOUTQ:
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN) return(-EINVAL);
			amount = sk->prot->wspace(sk);
			err = verify_area(VERIFY_WRITE,(void *)arg,
				sizeof(unsigned long));
			if (err)
				return err;
			put_fs_long(amount,(unsigned long *)arg);
			return(0);
		}
		default:
			return(-EINVAL);
	}
}
1018
/*
 *	This routine computes a TCP checksum.
 *
 *	i386-only inline assembly. The first asm block folds the pseudo
 *	header (source, destination, protocol and length) into the sum;
 *	the second sums the TCP header+data 32 bytes per loop iteration,
 *	then handles the 4-byte, 2-byte and 1-byte tails, and finally
 *	folds the 32-bit accumulator down to 16 bits. The caller must
 *	have zeroed th->check first (see tcp_send_check).
 */
unsigned short tcp_check(struct tcphdr *th, int len,
	unsigned long saddr, unsigned long daddr)
{
	unsigned long sum;

	if (saddr == 0) saddr = ip_my_addr();

	/*
	 * stupid, gcc complains when I use just one __asm__ block,
	 * something about too many reloads, but this is just two
	 * instructions longer than what I want
	 */
	__asm__("
	addl %%ecx, %%ebx
	adcl %%edx, %%ebx
	adcl $0, %%ebx
	"
	: "=b"(sum)
	: "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
	: "bx", "cx", "dx" );
	__asm__("
	movl %%ecx, %%edx
	cld
	cmpl $32, %%ecx
	jb 2f
	shrl $5, %%ecx
	clc
1:	lodsl
	adcl %%eax, %%ebx
	lodsl
	adcl %%eax, %%ebx
	lodsl
	adcl %%eax, %%ebx
	lodsl
	adcl %%eax, %%ebx
	lodsl
	adcl %%eax, %%ebx
	lodsl
	adcl %%eax, %%ebx
	lodsl
	adcl %%eax, %%ebx
	lodsl
	adcl %%eax, %%ebx
	loop 1b
	adcl $0, %%ebx
	movl %%edx, %%ecx
2:	andl $28, %%ecx
	je 4f
	shrl $2, %%ecx
	clc
3:	lodsl
	adcl %%eax, %%ebx
	loop 3b
	adcl $0, %%ebx
4:	movl $0, %%eax
	testw $2, %%dx
	je 5f
	lodsw
	addl %%eax, %%ebx
	adcl $0, %%ebx
	movw $0, %%ax
5:	test $1, %%edx
	je 6f
	lodsb
	addl %%eax, %%ebx
	adcl $0, %%ebx
6:	movl %%ebx, %%eax
	shrl $16, %%eax
	addw %%ax, %%bx
	adcw $0, %%bx
	"
	: "=b"(sum)
	: "0"(sum), "c"(len), "S"(th)
	: "ax", "bx", "cx", "dx", "si" );

	/* We only want the bottom 16 bits, but we never cleared the top 16. */

	return((~sum) & 0xffff);
}
1103
1104
1105 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1106 unsignedlongdaddr, intlen, structsock *sk)
1107 {1108 th->check = 0;
1109 th->check = tcp_check(th, len, saddr, daddr);
1110 return;
1111 }1112
/*
 * This is the main buffer sending routine.  We queue the buffer
 * having checked it is sane seeming.
 */
static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 * length of packet (not counting length of pre-tcp headers)
	 */
	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 * Sanity check it: the TCP header must fit and must lie inside
	 * the buffer.  A bad frame is dropped, not sent.
	 */
	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 * If we have queued a header size packet.. (these crash a few
	 * tcp stacks if ack is not set)
	 */
	if (size == sizeof(struct tcphdr))
	{
		/* If it's got a syn or fin it's notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 * Actual processing.
	 */
	tcp_statistics.TcpOutSegs++;
	/* Sequence number of the byte just past this frame (SYN/FIN
	   occupy sequence space; header bytes do not). */
	skb->h.seq = ntohl(th->seq) + size - 4*th->doff;

	/*
	 * We must queue rather than send immediately if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */
	if (after(skb->h.seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit.  So
		 * we shouldn't need to set it at all.  I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 * If we don't fit we have to start the zero window
		 * probes.  This is broken - we really need to do a partial
		 * send _first_ (This is what causes the Cisco and PC/TCP
		 * grief).
		 */
		if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 * This is going straight out: stamp in the current ack
		 * and window, checksum, and hand to the IP layer.
		 */
		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 * This is mad.  The tcp retransmit queue is put together
		 * by the ip layer.  This causes half the problems with
		 * unroutable FIN's and other things.
		 */
		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 * Set for next retransmit based on expected ACK time.
		 * FIXME: We set this every time which means our
		 * retransmits are really about a window behind.
		 */
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
1228 /*1229 * Locking problems lead us to a messy situation where we can have1230 * multiple partially complete buffers queued up. This is really bad1231 * as we don't want to be sending partial buffers. Fix this with1232 * a semaphore or similar to lock tcp_write per socket.1233 *1234 * These routines are pretty self descriptive.1235 */1236
1237 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1238 {1239 structsk_buff * skb;
1240 unsignedlongflags;
1241
1242 save_flags(flags);
1243 cli();
1244 skb = sk->partial;
1245 if (skb) {1246 sk->partial = NULL;
1247 del_timer(&sk->partial_timer);
1248 }1249 restore_flags(flags);
1250 returnskb;
1251 }1252
1253 /*1254 * Empty the partial queue1255 */1256
1257 staticvoidtcp_send_partial(structsock *sk)
/* */1258 {1259 structsk_buff *skb;
1260
1261 if (sk == NULL)
1262 return;
1263 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1264 tcp_send_skb(sk, skb);
1265 }1266
/*
 * Queue a partial frame.
 *
 * Installs skb as the socket's half-built buffer and arms a timer so
 * the frame is flushed even if no more data arrives to fill it.  Any
 * previously queued partial buffer is displaced and sent immediately
 * (after interrupts are re-enabled) so at most one partial frame is
 * ever pending.
 */
void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	tmp = sk->partial;
	if (tmp)
		del_timer(&sk->partial_timer);
	sk->partial = skb;
	init_timer(&sk->partial_timer);
	/*
	 * Wait up to 1 second for the buffer to fill.
	 * NOTE(review): expires is set to a relative tick count (HZ);
	 * this assumes this kernel's add_timer() treats expires as
	 * relative to now - confirm against kernel/sched.c.
	 */
	sk->partial_timer.expires = HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	/* Flush the displaced buffer outside the cli() region. */
	if (tmp)
		tcp_send_skb(sk, tmp);
}
1295
/*
 * This routine sends an ack and also updates the window.
 *
 * sequence/ack are host-order sequence numbers; th is the header of
 * the frame being acknowledged (used to swap ports); daddr is the
 * destination address.  If no memory is available the ack is merely
 * backlogged and a short timer forces one out later (an ack is
 * unreliable anyway, so this only costs latency, not correctness).
 */
static void tcp_send_ack(unsigned long sequence, unsigned long ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;	/* We have been reset, we may not send again */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */
	buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 * Force it to send an ack.  We don't have to do this
		 * (ACK is unreliable) but it's much better use of
		 * bandwidth on slow links to send a spare ack than
		 * resend packets.
		 */
		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 * Assemble a suitable TCP frame
	 */
	buff->len = sizeof(struct tcphdr);
	buff->sk = sk;
	buff->localroute = sk->localroute;
	t1 =(struct tcphdr *) buff->data;

	/*
	 * Put in the IP header and routing stuff.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}
	buff->len += tmp;
	t1 =(struct tcphdr *)((char *)t1 +tmp);

	/* Start from the incoming header, then swap the endpoints. */
	memcpy(t1, th, sizeof(*t1));

	/*
	 * Swap the send and the receive.
	 * NOTE(review): ntohl/ntohs are applied to host-order values
	 * here where htonl/htons is meant; the two are the same
	 * operation on i386, which this file targets.
	 */
	t1->dest = th->source;
	t1->source = th->dest;
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 * If we have nothing queued for transmit and the transmit timer
	 * is on we are just doing an ACK timeout and need to switch
	 * to a keepalive.
	 */
	if (ack == sk->acked_seq)
	{
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
			&& sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 * Fill in the packet and send it
	 */
	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}
1414
1415 /* 1416 * This routine builds a generic TCP header. 1417 */1418
1419 extern__inlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */1420 {1421
1422 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1423 th->seq = htonl(sk->write_seq);
1424 th->psh =(push == 0) ? 1 : 0;
1425 th->doff = sizeof(*th)/4;
1426 th->ack = 1;
1427 th->fin = 0;
1428 sk->ack_backlog = 0;
1429 sk->bytes_rcv = 0;
1430 sk->ack_timed = 0;
1431 th->ack_seq = htonl(sk->acked_seq);
1432 sk->window = tcp_select_window(sk);
1433 th->window = htons(sk->window);
1434
1435 return(sizeof(*th));
1436 }1437
/*
 * This routine copies from a user buffer into a socket,
 * and starts the transmit system.
 *
 * Returns the number of bytes queued, or a negative errno if nothing
 * was queued before the error/signal.  May sleep (unless nonblock)
 * waiting for connection establishment or buffer memory.  flags may
 * carry MSG_OOB (urgent data) and MSG_DONTROUTE.
 */
static int tcp_write(struct sock *sk, unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;
	int copy;
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	unsigned char *buff;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1;
	prot = sk->prot;
	while(len > 0)
	{
		if (sk->err)
		{			/* Stop on an error */
			release_sock(sk);
			if (copied)
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 * First thing we do is make sure that we are established.
		 */
		if (sk->shutdown & SEND_SHUTDOWN)
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied)
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/*
		 * Wait for a connection to finish.
		 */
		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
		{
			if (sk->err)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			/* Not in a connecting state either: the write is doomed. */
			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
			{
				release_sock(sk);
				if (copied)
					return(copied);

				if (sk->err)
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen)
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/* Sleep until the handshake completes.  The state is
			   re-tested under cli() to close the race with the
			   network bottom half changing it. */
			release_sock(sk);
			cli();

			if (sk->state != TCP_ESTABLISHED &&
			    sk->state != TCP_CLOSE_WAIT && sk->err == 0)
			{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

	/*
	 * The following code can result in copy <= 0 if sk->mss is ever
	 * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
	 * sk->mtu is constant once SYN processing is finished.  I.e. we
	 * had better not get here until we've seen his SYN and at least one
	 * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
	 * But ESTABLISHED should guarantee that.  sk->max_window is by definition
	 * non-decreasing.  Note that any ioctl to set user_mss must be done
	 * before the exchange of SYN's.  If the initial ack from the other
	 * end has a window of 0, max_window and thus mss will both be 0.
	 */

	/*
	 * Now we need to check if we have a half built packet.
	 */
		if ((skb = tcp_dequeue_partial(sk)) != NULL)
		{
			int hdrlen;

			/* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				 + sizeof(struct tcphdr);

			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB))
			{
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0)
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}

				memcpy_fromfs(skb->data + skb->len, from, copy);
				skb->len += copy;
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy;
			}
			/* Send the frame if it is now full, if it is urgent, or
			   if nothing is in flight; otherwise re-queue it. */
			if ((skb->len - hdrlen) >= sk->mss ||
				(flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);
			continue;
		}

	/*
	 * We also need to worry about the window.
	 * If window < 1/2 the maximum window we've seen from this
	 * host, don't use it.  This is sender side
	 * silly window prevention, as specified in RFC1122.
	 * (Note that this is different than earlier versions of
	 * SWS prevention, e.g. RFC813.).  What we actually do is
	 * use the whole MSS.  Since the results in the right
	 * edge of the packet being outside the window, it will
	 * be queued for later rather than sent.
	 */
		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		if (copy > len)
			copy = len;

	/*
	 * We should really check the window here also.
	 */
		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB))
		{
			/*
			 * We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			/*
			 * NB: following must be mtu, because mss can be increased.
			 * mss is always <= mtu
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb;
		}
		else
		{
			/*
			 * We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 * If we didn't get any memory, we need to sleep.
		 */
		if (skb == NULL)
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 * FIXME: here is another race condition.
			 */
			tmp = sk->wmem_alloc;
			release_sock(sk);
			cli();
			/*
			 * Again we will try to avoid it: only sleep if no
			 * write memory was freed while we were away.
			 */
			if (tmp <= sk->wmem_alloc &&
				  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0)
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->len = 0;
		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

		buff = skb->data;

		/*
		 * FIXME: we need to optimize this.
		 * Perhaps some hints here would be good.
		 */
		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 )
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}
		skb->len += tmp;
		skb->dev = dev;
		buff += tmp;
		skb->h.th =(struct tcphdr *) buff;
		tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
		if (tmp < 0)
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB)
		{
			((struct tcphdr *)buff)->urg = 1;
			((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
		}
		skb->len += tmp;
		memcpy_fromfs(buff+tmp, from, copy);

		from += copy;
		copied += copy;
		len -= copy;
		skb->len += copy;
		skb->free = 0;
		sk->write_seq += copy;

		/* Sub-mss frame with data already in flight: hold it back
		   as a partial buffer (Nagle). */
		if (send_tmp != NULL && sk->packets_out)
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

	/*
	 * Nagle's rule.  Turn Nagle off with TCP_NODELAY for highly
	 * interactive fast network servers.  It's meant to be on and
	 * it really improves the throughput though not the echo time
	 * on my slow slip link - Alan
	 */

	/*
	 * Avoid possible race on send_tmp - c/o Johannes Stille
	 */
	if(sk->partial && ((!sk->packets_out)
	/* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
1768 /*1769 * This is just a wrapper. 1770 */1771
1772 staticinttcp_sendto(structsock *sk, unsignedchar *from,
/* */1773 intlen, intnonblock, unsignedflags,
1774 structsockaddr_in *addr, intaddr_len)
1775 {1776 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1777 return -EINVAL;
1778 if (sk->state == TCP_CLOSE)
1779 return -ENOTCONN;
1780 if (addr_len < sizeof(*addr))
1781 return -EINVAL;
1782 if (addr->sin_family && addr->sin_family != AF_INET)
1783 return -EINVAL;
1784 if (addr->sin_port != sk->dummy_th.dest)
1785 return -EISCONN;
1786 if (addr->sin_addr.s_addr != sk->daddr)
1787 return -EISCONN;
1788 returntcp_write(sk, from, len, nonblock, flags);
1789 }1790
1791
/*
 * Send an ack if one is backlogged at this point.  Ought to merge
 * this with tcp_send_ack().
 *
 * Called from the read path (cleanup_rbuf) after the user has drained
 * data, so the window we advertise here reflects the newly freed
 * receive space.
 */
static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	if (!sk->ack_backlog)
		return;

	/*
	 * If we're closed, don't send an ack, or we'll get a RST
	 * from the closed destination.
	 */
	if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
		return;

	/*
	 * FIXME: we need to put code here to prevent this routine from
	 * being called.  Being called once in a while is ok, so only check
	 * if this is the second time in a row.
	 */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */
	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->len = sizeof(struct tcphdr);
	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 * Put in the IP header and routing stuff.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}

	buff->len += tmp;
	t1 =(struct tcphdr *)(buff->data +tmp);

	/* Build a pure ack from the socket's template header. */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
1873
/*
 * FIXME:
 * This routine frees used buffers.
 * It should consider sending an ACK to let the
 * other end know we now have a bigger window.
 *
 * Called after a read has consumed data: releases every fully-used,
 * unreferenced skb at the head of the receive queue, then decides
 * whether the freed space warrants an immediate window-update ack,
 * a delayed ack, or nothing.
 */
static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	/* Remember the receive space before freeing, to detect change. */
	left = sk->prot->rspace(sk);

	/*
	 * We have to loop through all the buffer headers,
	 * and try to free up all the space we can.  Stop at the first
	 * skb still unread (!used) or still referenced by a sleeping
	 * reader (users).
	 */
	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 * FIXME:
	 * At this point we should send an ack if the difference
	 * in the window, and the amount of space is bigger than
	 * TCP_WINDOW_DIFF.
	 */
	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
					    left);
	if ((rspace=sk->prot->rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
		/*
		 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
		 * if the other end is offering a window smaller than the agreed on MSS
		 * (called sk->mtu here).  In theory there's no connection between send
		 * and receive, and so no reason to think that they're going to send
		 * small packets.  For the moment I'm using the hack of reducing the mss
		 * only on the send side, so I'm putting mtu here.
		 */
		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				/* A nearer timer was already pending; put it back. */
				add_timer(&sk->retransmit_timer);
		}
	}
}
1963
/*
 * Handle reading urgent data.  BSD has very simple semantics for
 * this, no blocking and very strange errors 8)
 *
 * Returns 1 with the single urgent byte copied out, 0 on a clean
 * end-of-data condition, or a negative errno.  Never blocks,
 * regardless of the socket's blocking mode (BSD semantics).
 */
static int tcp_read_urg(struct sock * sk, int nonblock,
	     unsigned char *to, int len, unsigned flags)
{
	/*
	 * No URG data to read: inline mode delivers it in the normal
	 * stream, and URG_READ means it was already consumed.
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
	{
		int tmp = -sk->err;
		sk->err = 0;
		return tmp;
	}

	if (sk->state == TCP_CLOSE || sk->done)
	{
		/* First EOF indication reads as 0, subsequent as ENOTCONN. */
		if (!sk->done) {
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	/* Everything above returns before the socket is marked in use. */
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID)
	{
		/* Low byte of urg_data is the urgent byte itself. */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;
		put_fs_byte(c, to);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
2021
/*
 * This routine copies from a sock struct into the user buffer.
 *
 * Returns the number of bytes copied, 0 at end of stream, or a
 * negative errno if nothing was copied first.  MSG_OOB is diverted to
 * tcp_read_urg(); MSG_PEEK reads without consuming.  Blocks (unless
 * nonblock) when no data is queued.
 */
static int tcp_read(struct sock *sk, unsigned char *to,
	int len, int nonblock, unsigned flags)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	unsigned long peek_seq;
	volatile unsigned long *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 * This error should be checked.
	 */
	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 * Urgent data needs to be handled specially.
	 */
	if (flags & MSG_OOB)
		return tcp_read_urg(sk, nonblock, to, len, flags);

	/*
	 * Copying sequence to update.  This is volatile to handle
	 * the multi-reader case neatly (memcpy_to/fromfs might be
	 * inline and thus not flush cached variables otherwise).
	 * A PEEK advances a private copy; a normal read advances
	 * sk->copied_seq itself.
	 */
	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		unsigned long offset;

		/*
		 * Are we at urgent data?  Stop if we have read anything.
		 */
		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 * Next get a buffer.  Set the task state before scanning so
		 * a wakeup between the scan and schedule() is not lost.
		 */
		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* A hole before this skb: nothing contiguous to read. */
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			/* SYN occupies a sequence number but no data byte. */
			if (skb->h.th->syn)
				offset--;
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		if (copied)
			break;

		if (sk->err)
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Release everything we hold and wait for more data. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 * Lock the buffer.  We can be fairly relaxed as
		 * an interrupt will never steal a buffer we are
		 * using unless I've missed something serious in
		 * tcp_data.
		 */
		skb->users++;

		/*
		 * Ok so how much can we use ?
		 */
		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 * Do we have urgent data here?  If so, either stop short of
		 * it, or (non-inline mode) skip over the urgent byte.
		 */
		if (sk->urg_data)
		{
			unsigned long urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					if (!sk->urginline)
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;
			}
		}

		/*
		 * Copy it - We _MUST_ update *seq first so that we
		 * don't ever double read when we have dual readers
		 */
		*seq += used;

		/*
		 * This memcpy_tofs can sleep.  If it sleeps and we
		 * do a second read it relies on the skb->users to avoid
		 * a crash when cleanup_rbuf() gets called.
		 */
		memcpy_tofs(to,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		to += used;

		/*
		 * We now will not sleep again until we are finished
		 * with skb.  Sorry if you are doing the SMP port
		 * but you'll just have to fix it neatly ;)
		 */
		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;
		if (used + offset < skb->len)
			continue;

		/*
		 * Process the FIN.
		 */
		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		continue;

	found_fin_ok:
		/* The FIN itself consumes one sequence number. */
		++*seq;
		if (flags & MSG_PEEK)
			break;

		/*
		 * All is done
		 */
		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
/*
 * State processing on a close.  This implements the state shift for
 * sending our FIN frame.  Note that we only send a FIN for some
 * states.  A shutdown() may have already sent the FIN, or we may be
 * closed.
 *
 * Returns 1 if the caller should transmit a FIN, 0 otherwise.
 * 'dead' is set when the socket has no owning process left.
 */
static int tcp_close_state(struct sock *sk, int dead)
{
	int ns=TCP_CLOSE;
	int send_fin=0;
	switch(sk->state)
	{
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:	/* Closedown begin */
			ns=TCP_FIN_WAIT1;
			send_fin=1;
			break;
		case TCP_FIN_WAIT1:	/* Already closing, or FIN sent: no change */
		case TCP_FIN_WAIT2:
		case TCP_CLOSING:
			ns=sk->state;
			break;
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;
	}

	tcp_set_state(sk,ns);

	/*
	 * This is a (useful) BSD violating of the RFC.  There is a
	 * problem with TCP as specified in that the other end could
	 * keep a socket open forever with no application left this end.
	 * We use a 3 minute timeout (about the same as BSD) then kill
	 * our end.  If they send after that then tough - BUT: long enough
	 * that we won't make the old 4*rto = almost no time - whoops
	 * reset mistake.
	 */
	if(dead && ns==TCP_FIN_WAIT2)
	{
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			/* An earlier timer is already pending; restore it. */
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
	}

	return send_fin;
}
/*
 * Send a fin.
 *
 * Builds a FIN|ACK frame and either transmits it at once or, if data
 * is still queued for output, appends it to the write queue so it goes
 * out in order.  sk->write_seq is advanced past the FIN in all cases,
 * including the build_header failure path (the FIN is then treated as
 * a lost send).
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 * Administrivia
	 */
	buff->sk = sk;
	buff->len = sizeof(*t1);
	buff->localroute = sk->localroute;
	t1 =(struct tcphdr *) buff->data;

	/*
	 * Put in the IP header and routing stuff.
	 */
	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		int t;
		/*
		 * Finish anyway, treat this as a send that got lost.
		 * (Not good).
		 */
		buff->free = 1;
		prot->wfree(sk,buff->mem_addr, buff->mem_len);
		sk->write_seq++;
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}

	/*
	 * We ought to check if the end of the queue is a buffer and
	 * if so simply add the fin to that buffer, not send it ahead.
	 */
	t1 =(struct tcphdr *)((char *)t1 +tmp);
	buff->len += tmp;
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	t1->seq = ntohl(sk->write_seq);
	sk->write_seq++;
	buff->h.seq = sk->write_seq;
	t1->ack = 1;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 * If there is data in the write queue, the fin must be appended to
	 * the write queue so it is sent in sequence after the data.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		buff->free = 0;
		if (buff->next != NULL)
		{
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	}
	else
	{
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2413 /*2414 * Shutdown the sending side of a connection. Much like close except2415 * that we don't receive shut down or set sk->dead=1.2416 */2417
2418 voidtcp_shutdown(structsock *sk, inthow)
/* */2419 {2420 /*2421 * We need to grab some memory, and put together a FIN,2422 * and then put it into the queue to be sent.2423 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2424 */2425
2426 if (!(how & SEND_SHUTDOWN))
2427 return;
2428
2429 /*2430 * If we've already sent a FIN, or it's a closed state2431 */2432
2433 if (sk->state == TCP_FIN_WAIT1 ||
2434 sk->state == TCP_FIN_WAIT2 ||
2435 sk->state == TCP_CLOSING ||
2436 sk->state == TCP_LAST_ACK ||
2437 sk->state == TCP_TIME_WAIT ||
2438 sk->state == TCP_CLOSE ||
2439 sk->state == TCP_LISTEN2440 )
2441 {2442 return;
2443 }2444 sk->inuse = 1;
2445
2446 /*2447 * flag that the sender has shutdown2448 */2449
2450 sk->shutdown |= SEND_SHUTDOWN;
2451
2452 /*2453 * Clear out any half completed packets. 2454 */2455
2456 if (sk->partial)
2457 tcp_send_partial(sk);
2458
2459 /*2460 * FIN if needed2461 */2462
2463 if(tcp_close_state(sk,0))
2464 tcp_send_fin(sk);
2465
2466 release_sock(sk);
2467 }2468
2469
2470 staticint2471 tcp_recvfrom(structsock *sk, unsignedchar *to,
/* */2472 intto_len, intnonblock, unsignedflags,
2473 structsockaddr_in *addr, int *addr_len)
2474 {2475 intresult;
2476
2477 /* 2478 * Have to check these first unlike the old code. If 2479 * we check them after we lose data on an error2480 * which is wrong 2481 */2482
2483 if(addr_len)
2484 *addr_len = sizeof(*addr);
2485 result=tcp_read(sk, to, to_len, nonblock, flags);
2486
2487 if (result < 0)
2488 return(result);
2489
2490 if(addr)
2491 {2492 addr->sin_family = AF_INET;
2493 addr->sin_port = sk->dummy_th.dest;
2494 addr->sin_addr.s_addr = sk->daddr;
2495 }2496 return(result);
2497 }2498
2499
2500 /*2501 * This routine will send an RST to the other tcp. 2502 */2503
2504 staticvoidtcp_reset(unsignedlongsaddr, unsignedlongdaddr, structtcphdr *th,
/* */2505 structproto *prot, structoptions *opt, structdevice *dev, inttos, intttl)
2506 {2507 structsk_buff *buff;
2508 structtcphdr *t1;
2509 inttmp;
2510 structdevice *ndev=NULL;
2511
2512 /*2513 * Cannot reset a reset (Think about it).2514 */2515
2516 if(th->rst)
2517 return;
2518
2519 /*2520 * We need to grab some memory, and put together an RST,2521 * and then put it into the queue to be sent.2522 */2523
2524 buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2525 if (buff == NULL)
2526 return;
2527
2528 buff->len = sizeof(*t1);
2529 buff->sk = NULL;
2530 buff->dev = dev;
2531 buff->localroute = 0;
2532
2533 t1 =(structtcphdr *) buff->data;
2534
2535 /*2536 * Put in the IP header and routing stuff. 2537 */2538
2539 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2540 sizeof(structtcphdr),tos,ttl);
2541 if (tmp < 0)
2542 {2543 buff->free = 1;
2544 prot->wfree(NULL, buff->mem_addr, buff->mem_len);
2545 return;
2546 }2547
2548 t1 =(structtcphdr *)((char *)t1 +tmp);
2549 buff->len += tmp;
2550 memcpy(t1, th, sizeof(*t1));
2551
2552 /*2553 * Swap the send and the receive. 2554 */2555
2556 t1->dest = th->source;
2557 t1->source = th->dest;
2558 t1->rst = 1;
2559 t1->window = 0;
2560
2561 if(th->ack)
2562 {2563 t1->ack = 0;
2564 t1->seq = th->ack_seq;
2565 t1->ack_seq = 0;
2566 }2567 else2568 {2569 t1->ack = 1;
2570 if(!th->syn)
2571 t1->ack_seq=htonl(th->seq);
2572 else2573 t1->ack_seq=htonl(th->seq+1);
2574 t1->seq=0;
2575 }2576
2577 t1->syn = 0;
2578 t1->urg = 0;
2579 t1->fin = 0;
2580 t1->psh = 0;
2581 t1->doff = sizeof(*t1)/4;
2582 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2583 prot->queue_xmit(NULL, ndev, buff, 1);
2584 tcp_statistics.TcpOutSegs++;
2585 }2586
2587
2588 /*2589 * Look for tcp options. Parses everything but only knows about MSS.2590 * This routine is always called with the packet containing the SYN.2591 * However it may also be called with the ack to the SYN. So you2592 * can't assume this is always the SYN. It's always called after2593 * we have set up sk->mtu to our own MTU.2594 *2595 * We need at minimum to add PAWS support here. Possibly large windows2596 * as Linux gets deployed on 100Mb/sec networks.2597 */2598
2599 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2600 {2601 unsignedchar *ptr;
2602 intlength=(th->doff*4)-sizeof(structtcphdr);
2603 intmss_seen = 0;
2604
2605 ptr = (unsignedchar *)(th + 1);
2606
2607 while(length>0)
2608 {2609 intopcode=*ptr++;
2610 intopsize=*ptr++;
2611 switch(opcode)
2612 {2613 caseTCPOPT_EOL:
2614 return;
2615 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */2616 length--;
2617 ptr--; /* the opsize=*ptr++ above was a mistake */2618 continue;
2619
2620 default:
2621 if(opsize<=2) /* Avoid silly options looping forever */2622 return;
2623 switch(opcode)
2624 {2625 caseTCPOPT_MSS:
2626 if(opsize==4 && th->syn)
2627 {2628 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2629 mss_seen = 1;
2630 }2631 break;
2632 /* Add other options here as people feel the urge to implement stuff like large windows */2633 }2634 ptr+=opsize-2;
2635 length-=opsize;
2636 }2637 }2638 if (th->syn)
2639 {2640 if (! mss_seen)
2641 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2642 }2643 #ifdefCONFIG_INET_PCTCP2644 sk->mss = min(sk->max_window >> 1, sk->mtu);
2645 #else2646 sk->mss = min(sk->max_window, sk->mtu);
2647 #endif2648 }2649
/*
 *	Return the classful (A/B/C) network mask for an address given in
 *	network byte order; the mask comes back in network byte order too.
 */

static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);

	if (IN_CLASSA(host))
		return htonl(IN_CLASSA_NET);
	return IN_CLASSB(host) ? htonl(IN_CLASSB_NET) : htonl(IN_CLASSC_NET);
}
2660 /*2661 * Default sequence number picking algorithm.2662 * As close as possible to RFC 793, which2663 * suggests using a 250kHz clock.2664 * Further reading shows this assumes 2MB/s networks.2665 * For 10MB/s ethernet, a 1MHz clock is appropriate.2666 * That's funny, Linux has one built in! Use it!2667 */2668
2669 externinlineunsignedlongtcp_init_seq(void)
/* */2670 {2671 structtimevaltv;
2672 do_gettimeofday(&tv);
2673 returntv.tv_usec+tv.tv_sec*1000000;
2674 }2675
/*
 *	This routine handles a connection request.
 *	It should make sure we haven't already responded.
 *	Because of the way BSD works, we have to send a syn/ack now.
 *	This also means it will be harder to close a socket which is
 *	listening.
 */

static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 unsigned long daddr, unsigned long saddr,
		 struct options *opt, struct device *dev, unsigned long seq)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	unsigned char *ptr;
	struct sock *newsk;
	struct tcphdr *th;
	struct device *ndev=NULL;
	int tmp;
	struct rtable *rt;

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead)
	{
		sk->data_ready(sk,0);
	}
	else
	{
		if(sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * Make sure we can accept more. This will prevent a
	 * flurry of syns from eating up all our memory.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * We need to build a new sock struct.
	 * It is sort of bad to have a socket without an inode attached
	 * to it, but the wake_up's will just wake up the listening socket,
	 * and if the listening socket is destroyed before this is taken
	 * off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* just ignore the syn. It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * Clone the listener, then reset every per-connection field:
	 * queues, congestion state, timers and sequence bookkeeping all
	 * start from scratch for the new connection.
	 */
	memcpy(newsk, sk, sizeof(*newsk));
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;	/* slow start from one segment */
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	/* th->seq was converted to host order by the receive path; the SYN
	   itself consumes one unit of sequence space, hence the +1. */
	newsk->acked_seq = skb->h.th->seq+1;
	newsk->copied_seq = skb->h.th->seq+1;
	newsk->fin_seq = skb->h.th->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;		/* our initial send sequence number */
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger=0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function=&retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/*
	 * Swap these two, they are from our point of view.
	 */

	newsk->daddr = saddr;
	newsk->saddr = daddr;

	put_sock(newsk->num,newsk);
	newsk->dummy_th.res1 = 0;
	newsk->dummy_th.doff = 6;
	newsk->dummy_th.fin = 0;
	newsk->dummy_th.syn = 0;
	newsk->dummy_th.rst = 0;
	newsk->dummy_th.psh = 0;
	newsk->dummy_th.ack = 0;
	newsk->dummy_th.urg = 0;
	newsk->dummy_th.res2 = 0;
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->socket = NULL;

	/*
	 * Grab the ttl and tos values and use them
	 */

	newsk->ip_ttl=sk->ip_ttl;
	newsk->ip_tos=skb->ip_hdr->tos;

	/*
	 * Use 512 or whatever user asked for
	 */

	/*
	 * Note use of sk->user_mss, since user has no direct access to newsk
	 */

	rt=ip_rt_route(saddr, NULL,NULL);

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
		newsk->mtu = rt->rt_mss - HEADER_SIZE;
	else
	{
#ifdef CONFIG_INET_SNARL	/* Sub Nets Are Local */
		if ((saddr ^ daddr) & default_mask(saddr))
#else
		if ((saddr ^ daddr) & dev->pa_mask)
#endif
			/* Off-net destination: be conservative, use 576. */
			newsk->mtu = 576 - HEADER_SIZE;
		else
			newsk->mtu = MAX_WINDOW;
	}

	/*
	 * But not bigger than device MTU
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 * This will min with what arrived in the packet
	 */

	tcp_options(newsk,skb->h.th);

	/* Build and send the SYN/ACK reply. */
	buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		sk->err = ENOMEM;
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		/* And this will destroy it */
		release_sock(newsk);
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->len = sizeof(struct tcphdr)+4;	/* header plus the 4-byte MSS option */
	buff->sk = newsk;
	buff->localroute = newsk->localroute;

	t1 =(struct tcphdr *) buff->data;

	/*
	 * Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
			       IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

	/*
	 * Something went wrong.
	 */

	if (tmp < 0)
	{
		sk->err = tmp;
		buff->free = 1;
		kfree_skb(buff,FREE_WRITE);
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		release_sock(newsk);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->len += tmp;
	t1 =(struct tcphdr *)((char *)t1 +tmp);

	memcpy(t1, skb->h.th, sizeof(*t1));
	buff->h.seq = newsk->write_seq;
	/*
	 * Swap the send and the receive.
	 */
	t1->dest = skb->h.th->source;
	t1->source = newsk->dummy_th.source;
	t1->seq = ntohl(newsk->write_seq++);
	t1->ack = 1;
	newsk->window = tcp_select_window(newsk);
	newsk->sent_seq = newsk->write_seq;
	t1->window = ntohs(newsk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->ack_seq = ntohl(skb->h.th->seq+1);
	t1->doff = sizeof(*t1)/4+1;	/* header plus one 32-bit option word */
	/* Append the MSS option by hand: kind 2, length 4, 16-bit value. */
	ptr =(unsigned char *)(t1+1);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
	ptr[3] =(newsk->mtu) & 0xff;

	tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
	reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
	skb->sk = newsk;

	/*
	 * Charge the sock_buff to newsk.
	 */

	sk->rmem_alloc -= skb->mem_len;
	newsk->rmem_alloc += skb->mem_len;

	/* Park the SYN on the listener so accept() can find it. */
	skb_queue_tail(&sk->receive_queue,skb);
	sk->ack_backlog++;
	release_sock(newsk);
	tcp_statistics.TcpOutSegs++;
}
2956
2957 staticvoidtcp_close(structsock *sk, inttimeout)
/* */2958 {2959 /*2960 * We need to grab some memory, and put together a FIN, 2961 * and then put it into the queue to be sent.2962 */2963
2964 sk->inuse = 1;
2965
2966 if(sk->state == TCP_LISTEN)
2967 {2968 /* Special case */2969 tcp_set_state(sk, TCP_CLOSE);
2970 tcp_close_pending(sk);
2971 release_sock(sk);
2972 return;
2973 }2974
2975 sk->keepopen = 1;
2976 sk->shutdown = SHUTDOWN_MASK;
2977
2978 if (!sk->dead)
2979 sk->state_change(sk);
2980
2981 if (timeout == 0)
2982 {2983 structsk_buff *skb;
2984
2985 /*2986 * We need to flush the recv. buffs. We do this only on the2987 * descriptor close, not protocol-sourced closes, because the2988 * reader process may not have drained the data yet!2989 */2990
2991 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
2992 kfree_skb(skb, FREE_READ);
2993 /*2994 * Get rid off any half-completed packets. 2995 */2996
2997 if (sk->partial)
2998 tcp_send_partial(sk);
2999 }3000
3001
3002 /*3003 * Timeout is not the same thing - however the code likes3004 * to send both the same way (sigh).3005 */3006
3007 if(timeout)
3008 {3009 tcp_set_state(sk, TCP_CLOSE); /* Dead */3010 }3011 else3012 {3013 if(tcp_close_state(sk,1)==1)
3014 {3015 tcp_send_fin(sk);
3016 }3017 }3018 release_sock(sk);
3019 }3020
3021
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue. This happens as incoming acks
 *	open up the remote window for us.
 */

static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy.
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->h.seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it. This
			 *	cannot currently occur.
			 */

			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;

			/*
			 *	put in the ack seq and window at this point rather than earlier,
			 *	in order to keep them monotonic. We really want to avoid taking
			 *	back window allocations. That's legal, but RFC1122 says it's frowned on.
			 *	Ack and window will in general have changed since this packet was put
			 *	on the write queue.
			 */

			iph = (struct iphdr *)(skb->data +
					       skb->dev->hard_header_len);
			th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
			/* size = TCP header + payload remaining after the IP header */
			size = skb->len - (((unsigned char *) th) - skb->data);

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			/* Checksum must be recomputed since ack/window changed. */
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */

			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */

			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3113
/*
 *	This routine deals with incoming acks, but not outgoing ones.
 */

extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
	unsigned long ack;
	int flag = 0;

	/*
	 * 1 - there was data in packet as well as ack or new data is sent or
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if(sk->zapped)
		return(1);	/* Dead, cant ack any more so why bother */

	/*
	 *	Have we discovered a larger window?
	 */

	ack = ntohl(th->ack_seq);

	if (ntohs(th->window) > sk->max_window)
	{
		sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(sk->max_window>>1, sk->mtu);
#else
		sk->mss = min(sk->max_window, sk->mtu);
#endif
	}

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */

	if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
	{
		if(sk->debug)
			printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);

		/*
		 *	Keepalive processing.
		 */

		if (after(ack, sk->sent_seq))
		{
			return(0);
		}

		/*
		 *	Restart the keepalive timer.
		 */

		if (sk->keepopen)
		{
			if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
		}
		return(1);
	}

	/*
	 *	If there is data set flag 1
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 *	See if our window has been shrunk.
	 */

	if (after(sk->window_seq, ack+ntohs(th->window)))
	{
		/*
		 *	We may need to move packets from the send queue
		 *	to the write queue, if the window has been shrunk on us.
		 *	The RFC says you are not allowed to shrink your window
		 *	like this, but if the other end does, you must be able
		 *	to deal with it.
		 */
		struct sk_buff *skb;
		struct sk_buff *skb2;
		struct sk_buff *wskb = NULL;

		skb2 = sk->send_head;
		sk->send_head = NULL;
		sk->send_tail = NULL;

		/*
		 *	This is an artifact of a flawed concept. We want one
		 *	queue and a smarter send routine when we send all.
		 */

		flag |= 4;	/* Window changed */

		sk->window_seq = ack + ntohs(th->window);
		cli();		/* walk the retransmit list atomically */
		while (skb2 != NULL)
		{
			skb = skb2;
			skb2 = skb->link3;
			skb->link3 = NULL;
			if (after(skb->h.seq, sk->window_seq))
			{
				/* Now outside the window: back to the write queue. */
				if (sk->packets_out > 0)
					sk->packets_out--;
				/* We may need to remove this from the dev send list. */
				if (skb->next != NULL)
				{
					skb_unlink(skb);
				}
				/* Now add it to the write_queue, preserving order. */
				if (wskb == NULL)
					skb_queue_head(&sk->write_queue,skb);
				else
					skb_append(wskb,skb);
				wskb = skb;
			}
			else
			{
				/* Still inside the window: keep it on the
				   retransmit list. */
				if (sk->send_head == NULL)
				{
					sk->send_head = skb;
					sk->send_tail = skb;
				}
				else
				{
					sk->send_tail->link3 = skb;
					sk->send_tail = skb;
				}
				skb->link3 = NULL;
			}
		}
		sti();
	}

	/*
	 *	Pipe has emptied
	 */

	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out= 0;
	}

	/*
	 *	Update the right hand window edge of the host
	 */

	sk->window_seq = ack + ntohs(th->window);

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
		 * counter and increment it once every cwnd times. It's possible
		 * that this should be done only if sk->retransmits == 0. I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly. In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

	/*
	 *	Remember the highest ack received.
	 */

	sk->rcv_ack_seq = ack;

	/*
	 *	If this ack opens up a zero window, clear backoff. It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open?
		 */

		if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
		    ! before (sk->window_seq, sk->write_queue.next->h.seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt. this eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < 20)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = 20;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	while(sk->send_head != NULL)
	{
		/* Check for a bug. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived the other end.
		 */

		if (before(sk->send_head->h.seq, ack+1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 *	We were retransmitting. don't count this in RTT est
				 */
				flag |= 2;

				/*
				 *	even though we've gotten an ack, we're still
				 *	retransmitting as long as we're sending from
				 *	the retransmit queue. Keeping retransmits non-zero
				 *	prevents us from getting new data interspersed with
				 *	retransmissions.
				 */

				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 *	Note that we only reset backoff and rto in the
			 *	rtt recomputation code. And that doesn't happen
			 *	if there were retransmissions in effect. So the
			 *	first new packet after the retransmissions is
			 *	sent with the backoff still in effect. Not until
			 *	we get an ack from a non-retransmitted packet do
			 *	we reset the backoff and rto. This allows us to deal
			 *	with a situation where the network delay has increased
			 *	suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 *	We have one less packet out there.
			 */

			if (sk->packets_out > 0)
				sk->packets_out --;
			/*
			 *	Wake up the process, it can probably write more.
			 */
			if (!sk->dead)
				sk->write_space(sk);
			oskb = sk->send_head;

			if (!(flag&2))	/* Not retransmitting */
			{
				long m;

				/*
				 *	The following amusing code comes from Jacobson's
				 *	article in SIGCOMM '88. Note that rtt and mdev
				 *	are scaled versions of rtt and mean deviation.
				 *	This is designed to be as fast as possible
				 *	m stands for "measurement".
				 */

				m = jiffies - oskb->when;  /* RTT */
				if(m<=0)
					m=1;		/* IS THIS RIGHT FOR <0 ??? */
				m -= (sk->rtt >> 3);	/* m is now error in rtt est */
				sk->rtt += m;		/* rtt = 7/8 rtt + 1/8 new */
				if (m < 0)
					m = -m;		/* m is now abs(error) */
				m -= (sk->mdev >> 2);	/* similar update on mdev */
				sk->mdev += m;		/* mdev = 3/4 mdev + 1/4 new */

				/*
				 *	Now update timeout. Note that this removes any backoff.
				 */

				sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
				if (sk->rto > 120*HZ)
					sk->rto = 120*HZ;
				if (sk->rto < 20)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
					sk->rto = 20;
				sk->backoff = 0;
			}
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
					   In this case as we just set it up */
			cli();
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 *	We may need to remove this from the dev send list.
			 */

			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 *	XXX someone ought to look at this too.. at the moment, if skb_peek()
	 *	returns non-NULL, we completely ignore the timer stuff in the else
	 *	clause. We ought to organize the code so that else clause can
	 *	(should) be executed regardless, possibly moving the PROBE timer
	 *	reset over. The skb_peek() thing should only move stuff to the
	 *	write queue, NOT also manage the timer functions.
	 */

	/*
	 *	Maybe we can take some stuff off of the write queue,
	 *	and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room.
			 */
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 *	from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 *	from TCP_CLOSE we don't do anything
		 *
		 *	from anything else, if there is write data (or fin) pending,
		 *	we use a TIME_WRITE timeout, else if keepalive we reset to
		 *	a KEEPALIVE timeout, else we delete the timer.
		 *
		 *	We do not set flag for nominal write data, otherwise we may
		 *	force a state where we start to write itsy bitsy tidbits
		 *	of data.
		 */

		switch(sk->state) {
		case TCP_TIME_WAIT:
			/*
			 *	keep us in TIME_WAIT until we stop getting packets,
			 *	reset the timeout.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 *	don't touch the timer.
			 */
			break;
		default:
			/*
			 *	Must check send_head, write_queue, and ack_backlog
			 *	to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 *	We have nothing queued but space to send. Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 *	In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
	 *	we are now waiting for an acknowledge to our FIN. The other end is
	 *	already in TIME_WAIT.
	 *
	 *	Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if(sk->debug)
			printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			tcp_set_state(sk,TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 *	Move to FIN_WAIT2 to await a FIN from the other end. Set
	 *	SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 *	Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake
	 */

	if(sk->state==TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest=th->source;
		sk->copied_seq = sk->acked_seq;
		if(!sk->dead)
			sk->state_change(sk);
		if(sk->max_window==0)
		{
			sk->max_window=32;	/* Sanity check */
			sk->mss=min(sk->max_window,sk->mtu);
		}
	}

	/*
	 *	I make no guarantees about the first clause in the following
	 *	test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
	 *	what conditions "!flag" would be true. However I think the rest
	 *	of the conditions would prevent that from causing any
	 *	unnecessary retransmission.
	 *	Clearly if the first packet has expired it should be
	 *	retransmitted. The other alternative, "flag&2 && retransmits", is
	 *	harder to explain: You have to look carefully at how and when the
	 *	timer is set and with what timeout. The most recent transmission always
	 *	sets the timer. So in general if the most recent thing has timed
	 *	out, everything before it has as well. So we want to go ahead and
	 *	retransmit some more. If we didn't explicitly test for this
	 *	condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 *	would not be true. If you look at the pattern of timing, you can
	 *	show that rto is increased fast enough that the next packet would
	 *	almost never be retransmitted immediately. Then you'd end up
	 *	waiting for a timeout to send each packet on the retransmission
	 *	queue. With my implementation of the Karn sampling algorithm,
	 *	the timeout would double each time. The net result is that it would
	 *	take a hideous amount of time to recover from a single dropped packet.
	 *	It's possible that there should also be a test for TIME_WRITE, but
	 *	I think as long as "send_head != NULL" and "retransmit" is on, we've
	 *	got to be in real retransmission mode.
	 *	Note that tcp_do_retransmit is called with all==1. Setting cong_window
	 *	back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 *	As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	       (((flag&2) && sk->retransmits) ||
	       (sk->send_head->when + sk->rto < jiffies)))
	{
		if(sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return(1);
}
3704
3705 /*3706 * Process the FIN bit. This now behaves as it is supposed to work3707 * and the FIN takes effect when it is validly part of sequence3708 * space. Not before when we get holes.3709 *3710 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT3711 * (and thence onto LAST-ACK and finally, CLOSE, we never enter3712 * TIME-WAIT)3713 *3714 * If we are in FINWAIT-1, a received FIN indicates simultaneous3715 * close and we go into CLOSING (and later onto TIME-WAIT)3716 *3717 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.3718 *3719 */3720
3721 staticinttcp_fin(structsk_buff *skb, structsock *sk, structtcphdr *th)
/* */3722 {3723 sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3724
3725 if (!sk->dead)
3726 {3727 sk->state_change(sk);
3728 sock_wake_async(sk->socket, 1);
3729 }3730
3731 switch(sk->state)
3732 {3733 caseTCP_SYN_RECV:
3734 caseTCP_SYN_SENT:
3735 caseTCP_ESTABLISHED:
3736 /*3737 * move to CLOSE_WAIT, tcp_data() already handled3738 * sending the ack.3739 */3740 tcp_set_state(sk,TCP_CLOSE_WAIT);
3741 if (th->rst)
3742 sk->shutdown = SHUTDOWN_MASK;
3743 break;
3744
3745 caseTCP_CLOSE_WAIT:
3746 caseTCP_CLOSING:
3747 /*3748 * received a retransmission of the FIN, do3749 * nothing.3750 */3751 break;
3752 caseTCP_TIME_WAIT:
3753 /*3754 * received a retransmission of the FIN,3755 * restart the TIME_WAIT timer.3756 */3757 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3758 return(0);
3759 caseTCP_FIN_WAIT1:
3760 /*3761 * This case occurs when a simultaneous close3762 * happens, we must ack the received FIN and3763 * enter the CLOSING state.3764 *3765 * This causes a WRITE timeout, which will either3766 * move on to TIME_WAIT when we timeout, or resend3767 * the FIN properly (maybe we get rid of that annoying3768 * FIN lost hang). The TIME_WRITE code is already correct3769 * for handling this timeout.3770 */3771
3772 if(sk->ip_xmit_timeout != TIME_WRITE)
3773 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3774 tcp_set_state(sk,TCP_CLOSING);
3775 break;
3776 caseTCP_FIN_WAIT2:
3777 /*3778 * received a FIN -- send ACK and enter TIME_WAIT3779 */3780 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3781 sk->shutdown|=SHUTDOWN_MASK;
3782 tcp_set_state(sk,TCP_TIME_WAIT);
3783 break;
3784 caseTCP_CLOSE:
3785 /*3786 * already in CLOSE3787 */3788 break;
3789 default:
3790 tcp_set_state(sk,TCP_LAST_ACK);
3791
3792 /* Start the timers. */3793 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3794 return(0);
3795 }3796
3797 return(0);
3798 }3799
3800
3801
3802 /*3803 * This routine handles the data. If there is room in the buffer,3804 * it will be have already been moved into it. If there is no3805 * room, then we will just have to discard the packet.3806 */3807
3808 extern__inline__inttcp_data(structsk_buff *skb, structsock *sk,
/* */3809 unsignedlongsaddr, unsignedshortlen)
3810 {3811 structsk_buff *skb1, *skb2;
3812 structtcphdr *th;
3813 intdup_dumped=0;
3814 unsignedlongnew_seq;
3815 unsignedlongshut_seq;
3816
3817 th = skb->h.th;
3818 skb->len = len -(th->doff*4);
3819
3820 /*3821 * The bytes in the receive read/assembly queue has increased. Needed for the3822 * low memory discard algorithm 3823 */3824
3825 sk->bytes_rcv += skb->len;
3826
3827 if (skb->len == 0 && !th->fin)
3828 {3829 /* 3830 * Don't want to keep passing ack's back and forth. 3831 * (someone sent us dataless, boring frame)3832 */3833 if (!th->ack)
3834 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3835 kfree_skb(skb, FREE_READ);
3836 return(0);
3837 }3838
3839 /*3840 * We no longer have anyone receiving data on this connection.3841 */3842
3843 #ifndef TCP_DONT_RST_SHUTDOWN
3844
3845 if(sk->shutdown & RCV_SHUTDOWN)
3846 {3847 /*3848 * FIXME: BSD has some magic to avoid sending resets to3849 * broken 4.2 BSD keepalives. Much to my surprise a few non3850 * BSD stacks still have broken keepalives so we want to3851 * cope with it.3852 */3853
3854 if(skb->len) /* We don't care if it's just an ack or3855 a keepalive/window probe */3856 {3857 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */3858
3859 /* Do this the way 4.4BSD treats it. Not what I'd3860 regard as the meaning of the spec but it's what BSD3861 does and clearly they know everything 8) */3862
3863 /*3864 * This is valid because of two things3865 *3866 * a) The way tcp_data behaves at the bottom.3867 * b) A fin takes effect when read not when received.3868 */3869
3870 shut_seq=sk->acked_seq+1; /* Last byte */3871
3872 if(after(new_seq,shut_seq))
3873 {3874 if(sk->debug)
3875 printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n",
3876 sk, new_seq, shut_seq, sk->blog);
3877 if(sk->dead)
3878 {3879 sk->acked_seq = new_seq + th->fin;
3880 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3881 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3882 tcp_statistics.TcpEstabResets++;
3883 tcp_set_state(sk,TCP_CLOSE);
3884 sk->err = EPIPE;
3885 sk->shutdown = SHUTDOWN_MASK;
3886 kfree_skb(skb, FREE_READ);
3887 return 0;
3888 }3889 }3890 }3891 }3892
3893 #endif3894
3895 /*3896 * Now we have to walk the chain, and figure out where this one3897 * goes into it. This is set up so that the last packet we received3898 * will be the first one we look at, that way if everything comes3899 * in order, there will be no performance loss, and if they come3900 * out of order we will be able to fit things in nicely.3901 *3902 * [AC: This is wrong. We should assume in order first and then walk3903 * forwards from the first hole based upon real traffic patterns.]3904 * 3905 */3906
3907 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */3908 {3909 skb_queue_head(&sk->receive_queue,skb);
3910 skb1= NULL;
3911 }3912 else3913 {3914 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
3915 {3916 if(sk->debug)
3917 {3918 printk("skb1=%p :", skb1);
3919 printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
3920 printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
3921 printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
3922 sk->acked_seq);
3923 }3924
3925 /*3926 * Optimisation: Duplicate frame or extension of previous frame from3927 * same sequence point (lost ack case).3928 * The frame contains duplicate data or replaces a previous frame3929 * discard the previous frame (safe as sk->inuse is set) and put3930 * the new one in its place.3931 */3932
3933 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3934 {3935 skb_append(skb1,skb);
3936 skb_unlink(skb1);
3937 kfree_skb(skb1,FREE_READ);
3938 dup_dumped=1;
3939 skb1=NULL;
3940 break;
3941 }3942
3943 /*3944 * Found where it fits3945 */3946
3947 if (after(th->seq+1, skb1->h.th->seq))
3948 {3949 skb_append(skb1,skb);
3950 break;
3951 }3952
3953 /*3954 * See if we've hit the start. If so insert.3955 */3956 if (skb1 == skb_peek(&sk->receive_queue))
3957 {3958 skb_queue_head(&sk->receive_queue, skb);
3959 break;
3960 }3961 }3962 }3963
3964 /*3965 * Figure out what the ack value for this frame is3966 */3967
3968 th->ack_seq = th->seq + skb->len;
3969 if (th->syn)
3970 th->ack_seq++;
3971 if (th->fin)
3972 th->ack_seq++;
3973
3974 if (before(sk->acked_seq, sk->copied_seq))
3975 {3976 printk("*** tcp.c:tcp_data bug acked < copied\n");
3977 sk->acked_seq = sk->copied_seq;
3978 }3979
3980 /*3981 * Now figure out if we can ack anything. This is very messy because we really want two3982 * receive queues, a completed and an assembly queue. We also want only one transmit3983 * queue.3984 */3985
3986 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
3987 {3988 if (before(th->seq, sk->acked_seq+1))
3989 {3990 intnewwindow;
3991
3992 if (after(th->ack_seq, sk->acked_seq))
3993 {3994 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3995 if (newwindow < 0)
3996 newwindow = 0;
3997 sk->window = newwindow;
3998 sk->acked_seq = th->ack_seq;
3999 }4000 skb->acked = 1;
4001
4002 /*4003 * When we ack the fin, we do the FIN 4004 * processing.4005 */4006
4007 if (skb->h.th->fin)
4008 {4009 tcp_fin(skb,sk,skb->h.th);
4010 }4011
4012 for(skb2 = skb->next;
4013 skb2 != (structsk_buff *)&sk->receive_queue;
4014 skb2 = skb2->next)
4015 {4016 if (before(skb2->h.th->seq, sk->acked_seq+1))
4017 {4018 if (after(skb2->h.th->ack_seq, sk->acked_seq))
4019 {4020 newwindow = sk->window -
4021 (skb2->h.th->ack_seq - sk->acked_seq);
4022 if (newwindow < 0)
4023 newwindow = 0;
4024 sk->window = newwindow;
4025 sk->acked_seq = skb2->h.th->ack_seq;
4026 }4027 skb2->acked = 1;
4028 /*4029 * When we ack the fin, we do4030 * the fin handling.4031 */4032 if (skb2->h.th->fin)
4033 {4034 tcp_fin(skb,sk,skb->h.th);
4035 }4036
4037 /*4038 * Force an immediate ack.4039 */4040
4041 sk->ack_backlog = sk->max_ack_backlog;
4042 }4043 else4044 {4045 break;
4046 }4047 }4048
4049 /*4050 * This also takes care of updating the window.4051 * This if statement needs to be simplified.4052 */4053 if (!sk->delay_acks ||
4054 sk->ack_backlog >= sk->max_ack_backlog ||
4055 sk->bytes_rcv > sk->max_unacked || th->fin) {4056 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4057 }4058 else4059 {4060 sk->ack_backlog++;
4061 if(sk->debug)
4062 printk("Ack queued.\n");
4063 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4064 }4065 }4066 }4067
4068 /*4069 * If we've missed a packet, send an ack.4070 * Also start a timer to send another.4071 */4072
4073 if (!skb->acked)
4074 {4075
4076 /*4077 * This is important. If we don't have much room left,4078 * we need to throw out a few packets so we have a good4079 * window. Note that mtu is used, not mss, because mss is really4080 * for the send side. He could be sending us stuff as large as mtu.4081 */4082
4083 while (sk->prot->rspace(sk) < sk->mtu)
4084 {4085 skb1 = skb_peek(&sk->receive_queue);
4086 if (skb1 == NULL)
4087 {4088 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4089 break;
4090 }4091
4092 /*4093 * Don't throw out something that has been acked. 4094 */4095
4096 if (skb1->acked)
4097 {4098 break;
4099 }4100
4101 skb_unlink(skb1);
4102 kfree_skb(skb1, FREE_READ);
4103 }4104 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4105 sk->ack_backlog++;
4106 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4107 }4108 else4109 {4110 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4111 }4112
4113 /*4114 * Now tell the user we may have some data. 4115 */4116
4117 if (!sk->dead)
4118 {4119 if(sk->debug)
4120 printk("Data wakeup.\n");
4121 sk->data_ready(sk,0);
4122 }4123 return(0);
4124 }4125
4126
4127 /*4128 * This routine is only called when we have urgent data4129 * signalled. Its the 'slow' part of tcp_urg. It could be4130 * moved inline now as tcp_urg is only called from one4131 * place. We handle URGent data wrong. We have to - as4132 * BSD still doesn't use the correction from RFC961.4133 */4134
4135 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4136 {4137 unsignedlongptr = ntohs(th->urg_ptr);
4138
4139 if (ptr)
4140 ptr--;
4141 ptr += th->seq;
4142
4143 /* ignore urgent data that we've already seen and read */4144 if (after(sk->copied_seq, ptr))
4145 return;
4146
4147 /* do we already have a newer (or duplicate) urgent pointer? */4148 if (sk->urg_data && !after(ptr, sk->urg_seq))
4149 return;
4150
4151 /* tell the world about our new urgent pointer */4152 if (sk->proc != 0) {4153 if (sk->proc > 0) {4154 kill_proc(sk->proc, SIGURG, 1);
4155 }else{4156 kill_pg(-sk->proc, SIGURG, 1);
4157 }4158 }4159 sk->urg_data = URG_NOTYET;
4160 sk->urg_seq = ptr;
4161 }4162
4163 /*4164 * This is the 'fast' part of urgent handling.4165 */4166
4167 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4168 unsignedlongsaddr, unsignedlonglen)
4169 {4170 unsignedlongptr;
4171
4172 /*4173 * Check if we get a new urgent pointer - normally not 4174 */4175
4176 if (th->urg)
4177 tcp_check_urg(sk,th);
4178
4179 /*4180 * Do we wait for any urgent data? - normally not4181 */4182
4183 if (sk->urg_data != URG_NOTYET)
4184 return 0;
4185
4186 /*4187 * Is the urgent pointer pointing into this packet? 4188 */4189
4190 ptr = sk->urg_seq - th->seq + th->doff*4;
4191 if (ptr >= len)
4192 return 0;
4193
4194 /*4195 * Ok, got the correct packet, update info 4196 */4197
4198 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4199 if (!sk->dead)
4200 sk->data_ready(sk,0);
4201 return 0;
4202 }4203
4204 /*4205 * This will accept the next outstanding connection. 4206 */4207
4208 staticstructsock *tcp_accept(structsock *sk, intflags)
/* */4209 {4210 structsock *newsk;
4211 structsk_buff *skb;
4212
4213 /*4214 * We need to make sure that this socket is listening,4215 * and that it has something pending.4216 */4217
4218 if (sk->state != TCP_LISTEN)
4219 {4220 sk->err = EINVAL;
4221 return(NULL);
4222 }4223
4224 /* Avoid the race. */4225 cli();
4226 sk->inuse = 1;
4227
4228 while((skb = tcp_dequeue_established(sk)) == NULL)
4229 {4230 if (flags & O_NONBLOCK)
4231 {4232 sti();
4233 release_sock(sk);
4234 sk->err = EAGAIN;
4235 return(NULL);
4236 }4237
4238 release_sock(sk);
4239 interruptible_sleep_on(sk->sleep);
4240 if (current->signal & ~current->blocked)
4241 {4242 sti();
4243 sk->err = ERESTARTSYS;
4244 return(NULL);
4245 }4246 sk->inuse = 1;
4247 }4248 sti();
4249
4250 /*4251 * Now all we need to do is return skb->sk. 4252 */4253
4254 newsk = skb->sk;
4255
4256 kfree_skb(skb, FREE_READ);
4257 sk->ack_backlog--;
4258 release_sock(sk);
4259 return(newsk);
4260 }4261
4262
4263 /*4264 * This will initiate an outgoing connection. 4265 */4266
4267 staticinttcp_connect(structsock *sk, structsockaddr_in *usin, intaddr_len)
/* */4268 {4269 structsk_buff *buff;
4270 structdevice *dev=NULL;
4271 unsignedchar *ptr;
4272 inttmp;
4273 intatype;
4274 structtcphdr *t1;
4275 structrtable *rt;
4276
4277 if (sk->state != TCP_CLOSE)
4278 {4279 return(-EISCONN);
4280 }4281
4282 if (addr_len < 8)
4283 return(-EINVAL);
4284
4285 if (usin->sin_family && usin->sin_family != AF_INET)
4286 return(-EAFNOSUPPORT);
4287
4288 /*4289 * connect() to INADDR_ANY means loopback (BSD'ism).4290 */4291
4292 if(usin->sin_addr.s_addr==INADDR_ANY)
4293 usin->sin_addr.s_addr=ip_my_addr();
4294
4295 /*4296 * Don't want a TCP connection going to a broadcast address 4297 */4298
4299 if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
4300 return -ENETUNREACH;
4301
4302 sk->inuse = 1;
4303 sk->daddr = usin->sin_addr.s_addr;
4304 sk->write_seq = tcp_init_seq();
4305 sk->window_seq = sk->write_seq;
4306 sk->rcv_ack_seq = sk->write_seq -1;
4307 sk->err = 0;
4308 sk->dummy_th.dest = usin->sin_port;
4309 release_sock(sk);
4310
4311 buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4312 if (buff == NULL)
4313 {4314 return(-ENOMEM);
4315 }4316 sk->inuse = 1;
4317 buff->len = 24;
4318 buff->sk = sk;
4319 buff->free = 0;
4320 buff->localroute = sk->localroute;
4321
4322 t1 = (structtcphdr *) buff->data;
4323
4324 /*4325 * Put in the IP header and routing stuff. 4326 */4327
4328 rt=ip_rt_route(sk->daddr, NULL, NULL);
4329
4330
4331 /*4332 * We need to build the routing stuff from the things saved in skb. 4333 */4334
4335 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4336 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
4337 if (tmp < 0)
4338 {4339 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4340 release_sock(sk);
4341 return(-ENETUNREACH);
4342 }4343
4344 buff->len += tmp;
4345 t1 = (structtcphdr *)((char *)t1 +tmp);
4346
4347 memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4348 t1->seq = ntohl(sk->write_seq++);
4349 sk->sent_seq = sk->write_seq;
4350 buff->h.seq = sk->write_seq;
4351 t1->ack = 0;
4352 t1->window = 2;
4353 t1->res1=0;
4354 t1->res2=0;
4355 t1->rst = 0;
4356 t1->urg = 0;
4357 t1->psh = 0;
4358 t1->syn = 1;
4359 t1->urg_ptr = 0;
4360 t1->doff = 6;
4361 /* use 512 or whatever user asked for */4362
4363 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4364 sk->window_clamp=rt->rt_window;
4365 else4366 sk->window_clamp=0;
4367
4368 if (sk->user_mss)
4369 sk->mtu = sk->user_mss;
4370 elseif(rt!=NULL && (rt->rt_flags&RTF_MTU))
4371 sk->mtu = rt->rt_mss;
4372 else4373 {4374 #ifdefCONFIG_INET_SNARL4375 if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
4376 #else4377 if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
4378 #endif4379 sk->mtu = 576 - HEADER_SIZE;
4380 else4381 sk->mtu = MAX_WINDOW;
4382 }4383 /*4384 * but not bigger than device MTU 4385 */4386
4387 if(sk->mtu <32)
4388 sk->mtu = 32; /* Sanity limit */4389
4390 sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);
4391
4392 /*4393 * Put in the TCP options to say MTU. 4394 */4395
4396 ptr = (unsignedchar *)(t1+1);
4397 ptr[0] = 2;
4398 ptr[1] = 4;
4399 ptr[2] = (sk->mtu) >> 8;
4400 ptr[3] = (sk->mtu) & 0xff;
4401 tcp_send_check(t1, sk->saddr, sk->daddr,
4402 sizeof(structtcphdr) + 4, sk);
4403
4404 /*4405 * This must go first otherwise a really quick response will get reset. 4406 */4407
4408 tcp_set_state(sk,TCP_SYN_SENT);
4409 sk->rto = TCP_TIMEOUT_INIT;
4410 #if 0 /* we already did this */4411 init_timer(&sk->retransmit_timer);
4412 #endif4413 sk->retransmit_timer.function=&retransmit_timer;
4414 sk->retransmit_timer.data = (unsignedlong)sk;
4415 reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */4416 sk->retransmits = TCP_SYN_RETRIES;
4417
4418 sk->prot->queue_xmit(sk, dev, buff, 0);
4419 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4420 tcp_statistics.TcpActiveOpens++;
4421 tcp_statistics.TcpOutSegs++;
4422
4423 release_sock(sk);
4424 return(0);
4425 }4426
4427
4428 /* This functions checks to see if the tcp header is actually acceptable. */4429 extern__inline__inttcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4430 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4431 {4432 unsignedlongnext_seq;
4433
4434 next_seq = len - 4*th->doff;
4435 if (th->fin)
4436 next_seq++;
4437 /* if we have a zero window, we can't have any data in the packet.. */4438 if (next_seq && !sk->window)
4439 gotoignore_it;
4440 next_seq += th->seq;
4441
4442 /*4443 * This isn't quite right. sk->acked_seq could be more recent4444 * than sk->window. This is however close enough. We will accept4445 * slightly more packets than we should, but it should not cause4446 * problems unless someone is trying to forge packets.4447 */4448
4449 /* have we already seen all of this packet? */4450 if (!after(next_seq+1, sk->acked_seq))
4451 gotoignore_it;
4452 /* or does it start beyond the window? */4453 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4454 gotoignore_it;
4455
4456 /* ok, at least part of this packet would seem interesting.. */4457 return 1;
4458
4459 ignore_it:
4460 if (th->rst)
4461 return 0;
4462
4463 /*4464 * Send a reset if we get something not ours and we are4465 * unsynchronized. Note: We don't do anything to our end. We4466 * are just killing the bogus remote connection then we will4467 * connect again and it will work (with luck).4468 */4469
4470 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4471 {4472 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4473 return 1;
4474 }4475
4476 /* Try to resync things. */4477 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4478 return 0;
4479 }4480
/*
 *	When we get a reset we do this: record the error, move to CLOSE
 *	(except TIME_WAIT when RFC1337 protection is compiled in), wake
 *	the owner, and drop the frame. Consumes skb and releases sk.
 */
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;
	sk->err = ECONNRESET;
	if (sk->state == TCP_SYN_SENT)
		sk->err = ECONNREFUSED;
	if (sk->state == TCP_CLOSE_WAIT)
		sk->err = EPIPE;
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk, TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk, TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
4513 /*4514 * A TCP packet has arrived.4515 */4516
4517 inttcp_rcv(structsk_buff *skb, structdevice *dev, structoptions *opt,
/* */4518 unsignedlongdaddr, unsignedshortlen,
4519 unsignedlongsaddr, intredo, structinet_protocol * protocol)
4520 {4521 structtcphdr *th;
4522 structsock *sk;
4523 intsyn_ok=0;
4524
4525 if (!skb)
4526 {4527 printk("IMPOSSIBLE 1\n");
4528 return(0);
4529 }4530
4531 if (!dev)
4532 {4533 printk("IMPOSSIBLE 2\n");
4534 return(0);
4535 }4536
4537 tcp_statistics.TcpInSegs++;
4538
4539 if(skb->pkt_type!=PACKET_HOST)
4540 {4541 kfree_skb(skb,FREE_READ);
4542 return(0);
4543 }4544
4545 th = skb->h.th;
4546
4547 /*4548 * Find the socket.4549 */4550
4551 sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4552
4553 /*4554 * If this socket has got a reset it's to all intents and purposes 4555 * really dead. Count closed sockets as dead.4556 *4557 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD4558 * simply drops data. This seems incorrect as a 'closed' TCP doesn't4559 * exist so should cause resets as if the port was unreachable.4560 */4561
4562 if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4563 sk=NULL;
4564
4565 if (!redo)
4566 {4567 if (tcp_check(th, len, saddr, daddr ))
4568 {4569 skb->sk = NULL;
4570 kfree_skb(skb,FREE_READ);
4571 /*4572 * We don't release the socket because it was4573 * never marked in use.4574 */4575 return(0);
4576 }4577 th->seq = ntohl(th->seq);
4578
4579 /* See if we know about the socket. */4580 if (sk == NULL)
4581 {4582 /*4583 * No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)4584 */4585 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4586 skb->sk = NULL;
4587 /*4588 * Discard frame4589 */4590 kfree_skb(skb, FREE_READ);
4591 return(0);
4592 }4593
4594 skb->len = len;
4595 skb->acked = 0;
4596 skb->used = 0;
4597 skb->free = 0;
4598 skb->saddr = daddr;
4599 skb->daddr = saddr;
4600
4601 /* We may need to add it to the backlog here. */4602 cli();
4603 if (sk->inuse)
4604 {4605 skb_queue_tail(&sk->back_log, skb);
4606 sti();
4607 return(0);
4608 }4609 sk->inuse = 1;
4610 sti();
4611 }4612 else4613 {4614 if (sk==NULL)
4615 {4616 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4617 skb->sk = NULL;
4618 kfree_skb(skb, FREE_READ);
4619 return(0);
4620 }4621 }4622
4623
4624 if (!sk->prot)
4625 {4626 printk("IMPOSSIBLE 3\n");
4627 return(0);
4628 }4629
4630
4631 /*4632 * Charge the memory to the socket. 4633 */4634
4635 if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf)
4636 {4637 kfree_skb(skb, FREE_READ);
4638 release_sock(sk);
4639 return(0);
4640 }4641
4642 skb->sk=sk;
4643 sk->rmem_alloc += skb->mem_len;
4644
4645 /*4646 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We4647 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug4648 * compatibility. We also set up variables more thoroughly [Karn notes in the4649 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].4650 */4651
4652 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */4653 {4654
4655 /*4656 * Now deal with unusual cases.4657 */4658
4659 if(sk->state==TCP_LISTEN)
4660 {4661 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */4662 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4663
4664 /*4665 * We don't care for RST, and non SYN are absorbed (old segments)4666 * Broadcast/multicast SYN isn't allowed. Note - bug if you change the4667 * netmask on a running connection it can go broadcast. Even Sun's have4668 * this problem so I'm ignoring it 4669 */4670
4671 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4672 {4673 kfree_skb(skb, FREE_READ);
4674 release_sock(sk);
4675 return 0;
4676 }4677
4678 /* 4679 * Guess we need to make a new socket up 4680 */4681
4682 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4683
4684 /*4685 * Now we have several options: In theory there is nothing else4686 * in the frame. KA9Q has an option to send data with the syn,4687 * BSD accepts data with the syn up to the [to be] advertised window4688 * and Solaris 2.1 gives you a protocol error. For now we just ignore4689 * it, that fits the spec precisely and avoids incompatibilities. It4690 * would be nice in future to drop through and process the data.4691 */4692
4693 release_sock(sk);
4694 return 0;
4695 }4696
4697 /* retransmitted SYN? */4698 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4699 {4700 kfree_skb(skb, FREE_READ);
4701 release_sock(sk);
4702 return 0;
4703 }4704
4705 /*4706 * SYN sent means we have to look for a suitable ack and either reset4707 * for bad matches or go to connected 4708 */4709
4710 if(sk->state==TCP_SYN_SENT)
4711 {4712 /* Crossed SYN or previous junk segment */4713 if(th->ack)
4714 {4715 /* We got an ack, but it's not a good ack */4716 if(!tcp_ack(sk,th,saddr,len))
4717 {4718 /* Reset the ack - its an ack from a 4719 different connection [ th->rst is checked in tcp_reset()] */4720 tcp_statistics.TcpAttemptFails++;
4721 tcp_reset(daddr, saddr, th,
4722 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4723 kfree_skb(skb, FREE_READ);
4724 release_sock(sk);
4725 return(0);
4726 }4727 if(th->rst)
4728 returntcp_std_reset(sk,skb);
4729 if(!th->syn)
4730 {4731 /* A valid ack from a different connection4732 start. Shouldn't happen but cover it */4733 kfree_skb(skb, FREE_READ);
4734 release_sock(sk);
4735 return 0;
4736 }4737 /*4738 * Ok.. it's good. Set up sequence numbers and4739 * move to established.4740 */4741 syn_ok=1; /* Don't reset this connection for the syn */4742 sk->acked_seq=th->seq+1;
4743 sk->fin_seq=th->seq;
4744 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4745 tcp_set_state(sk, TCP_ESTABLISHED);
4746 tcp_options(sk,th);
4747 sk->dummy_th.dest=th->source;
4748 sk->copied_seq = sk->acked_seq;
4749 if(!sk->dead)
4750 {4751 sk->state_change(sk);
4752 sock_wake_async(sk->socket, 0);
4753 }4754 if(sk->max_window==0)
4755 {4756 sk->max_window = 32;
4757 sk->mss = min(sk->max_window, sk->mtu);
4758 }4759 }4760 else4761 {4762 /* See if SYN's cross. Drop if boring */4763 if(th->syn && !th->rst)
4764 {4765 /* Crossed SYN's are fine - but talking to4766 yourself is right out... */4767 if(sk->saddr==saddr && sk->daddr==daddr &&
4768 sk->dummy_th.source==th->source &&
4769 sk->dummy_th.dest==th->dest)
4770 {4771 tcp_statistics.TcpAttemptFails++;
4772 returntcp_std_reset(sk,skb);
4773 }4774 tcp_set_state(sk,TCP_SYN_RECV);
4775
4776 /*4777 * FIXME:4778 * Must send SYN|ACK here4779 */4780 }4781 /* Discard junk segment */4782 kfree_skb(skb, FREE_READ);
4783 release_sock(sk);
4784 return 0;
4785 }4786 /*4787 * SYN_RECV with data maybe.. drop through4788 */4789 gotorfc_step6;
4790 }4791
4792 /*4793 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is4794 * a more complex suggestion for fixing these reuse issues in RFC16444795 * but not yet ready for general use. Also see RFC1379.4796 */4797
4798 #defineBSD_TIME_WAIT4799 #ifdefBSD_TIME_WAIT4800 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
4801 after(th->seq, sk->acked_seq) && !th->rst)
4802 {4803 longseq=sk->write_seq;
4804 if(sk->debug)
4805 printk("Doing a BSD time wait\n");
4806 tcp_statistics.TcpEstabResets++;
4807 sk->rmem_alloc -= skb->mem_len;
4808 skb->sk = NULL;
4809 sk->err=ECONNRESET;
4810 tcp_set_state(sk, TCP_CLOSE);
4811 sk->shutdown = SHUTDOWN_MASK;
4812 release_sock(sk);
4813 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4814 if (sk && sk->state==TCP_LISTEN)
4815 {4816 sk->inuse=1;
4817 skb->sk = sk;
4818 sk->rmem_alloc += skb->mem_len;
4819 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
4820 release_sock(sk);
4821 return 0;
4822 }4823 kfree_skb(skb, FREE_READ);
4824 return 0;
4825 }4826 #endif4827 }4828
4829 /*4830 * We are now in normal data flow (see the step list in the RFC)4831 * Note most of these are inline now. I'll inline the lot when4832 * I have time to test it hard and look at what gcc outputs 4833 */4834
4835 if(!tcp_sequence(sk,th,len,opt,saddr,dev))
4836 {4837 kfree_skb(skb, FREE_READ);
4838 release_sock(sk);
4839 return 0;
4840 }4841
4842 if(th->rst)
4843 returntcp_std_reset(sk,skb);
4844
4845 /*4846 * !syn_ok is effectively the state test in RFC793.4847 */4848
4849 if(th->syn && !syn_ok)
4850 {4851 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
4852 returntcp_std_reset(sk,skb);
4853 }4854
4855 /*4856 * Process the ACK4857 */4858
4859
4860 if(th->ack && !tcp_ack(sk,th,saddr,len))
4861 {4862 /*4863 * Our three way handshake failed.4864 */4865
4866 if(sk->state==TCP_SYN_RECV)
4867 {4868 tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
4869 }4870 kfree_skb(skb, FREE_READ);
4871 release_sock(sk);
4872 return 0;
4873 }4874
4875 rfc_step6: /* I'll clean this up later */4876
4877 /*4878 * Process urgent data4879 */4880
4881 if(tcp_urg(sk, th, saddr, len))
4882 {4883 kfree_skb(skb, FREE_READ);
4884 release_sock(sk);
4885 return 0;
4886 }4887
4888
4889 /*4890 * Process the encapsulated data4891 */4892
4893 if(tcp_data(skb,sk, saddr, len))
4894 {4895 kfree_skb(skb, FREE_READ);
4896 release_sock(sk);
4897 return 0;
4898 }4899
4900 /*4901 * And done4902 */4903
4904 release_sock(sk);
4905 return 0;
4906 }4907
4908 /*4909 * This routine sends a packet with an out of date sequence4910 * number. It assumes the other end will try to ack it.4911 */4912
4913 staticvoidtcp_write_wakeup(structsock *sk)
/* */4914 {4915 structsk_buff *buff;
4916 structtcphdr *t1;
4917 structdevice *dev=NULL;
4918 inttmp;
4919
4920 if (sk->zapped)
4921 return; /* After a valid reset we can send no more */4922
4923 /*4924 * Write data can still be transmitted/retransmitted in the4925 * following states. If any other state is encountered, return.4926 * [listen/close will never occur here anyway]4927 */4928
4929 if (sk->state != TCP_ESTABLISHED &&
4930 sk->state != TCP_CLOSE_WAIT &&
4931 sk->state != TCP_FIN_WAIT1 &&
4932 sk->state != TCP_LAST_ACK &&
4933 sk->state != TCP_CLOSING4934 )
4935 {4936 return;
4937 }4938
4939 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
4940 if (buff == NULL)
4941 return;
4942
4943 buff->len = sizeof(structtcphdr);
4944 buff->free = 1;
4945 buff->sk = sk;
4946 buff->localroute = sk->localroute;
4947
4948 t1 = (structtcphdr *) buff->data;
4949
4950 /* Put in the IP header and routing stuff. */4951 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4952 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
4953 if (tmp < 0)
4954 {4955 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4956 return;
4957 }4958
4959 buff->len += tmp;
4960 t1 = (structtcphdr *)((char *)t1 +tmp);
4961
4962 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
4963
4964 /*4965 * Use a previous sequence.4966 * This should cause the other end to send an ack.4967 */4968
4969 t1->seq = htonl(sk->sent_seq-1);
4970 t1->ack = 1;
4971 t1->res1= 0;
4972 t1->res2= 0;
4973 t1->rst = 0;
4974 t1->urg = 0;
4975 t1->psh = 0;
4976 t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */4977 t1->syn = 0;
4978 t1->ack_seq = ntohl(sk->acked_seq);
4979 t1->window = ntohs(tcp_select_window(sk));
4980 t1->doff = sizeof(*t1)/4;
4981 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
4982 /*4983 * Send it and free it.4984 * This will prevent the timer from automatically being restarted.4985 */4986 sk->prot->queue_xmit(sk, dev, buff, 1);
4987 tcp_statistics.TcpOutSegs++;
4988 }4989
4990 /*4991 * A window probe timeout has occurred.4992 */4993
4994 voidtcp_send_probe0(structsock *sk)
/* */4995 {4996 if (sk->zapped)
4997 return; /* After a valid reset we can send no more */4998
4999 tcp_write_wakeup(sk);
5000
5001 sk->backoff++;
5002 sk->rto = min(sk->rto << 1, 120*HZ);
5003 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5004 sk->retransmits++;
5005 sk->prot->retransmits ++;
5006 }5007
5008 /*5009 * Socket option code for TCP. 5010 */5011
5012 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5013 {5014 intval,err;
5015
5016 if(level!=SOL_TCP)
5017 returnip_setsockopt(sk,level,optname,optval,optlen);
5018
5019 if (optval == NULL)
5020 return(-EINVAL);
5021
5022 err=verify_area(VERIFY_READ, optval, sizeof(int));
5023 if(err)
5024 returnerr;
5025
5026 val = get_fs_long((unsignedlong *)optval);
5027
5028 switch(optname)
5029 {5030 caseTCP_MAXSEG:
5031 /*5032 * values greater than interface MTU won't take effect. however at5033 * the point when this call is done we typically don't yet know5034 * which interface is going to be used5035 */5036 if(val<1||val>MAX_WINDOW)
5037 return -EINVAL;
5038 sk->user_mss=val;
5039 return 0;
5040 caseTCP_NODELAY:
5041 sk->nonagle=(val==0)?0:1;
5042 return 0;
5043 default:
5044 return(-ENOPROTOOPT);
5045 }5046 }5047
5048 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5049 {5050 intval,err;
5051
5052 if(level!=SOL_TCP)
5053 returnip_getsockopt(sk,level,optname,optval,optlen);
5054
5055 switch(optname)
5056 {5057 caseTCP_MAXSEG:
5058 val=sk->user_mss;
5059 break;
5060 caseTCP_NODELAY:
5061 val=sk->nonagle;
5062 break;
5063 default:
5064 return(-ENOPROTOOPT);
5065 }5066 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5067 if(err)
5068 returnerr;
5069 put_fs_long(sizeof(int),(unsignedlong *) optlen);
5070
5071 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5072 if(err)
5073 returnerr;
5074 put_fs_long(val,(unsignedlong *)optval);
5075
5076 return(0);
5077 }5078
5079
5080 structprototcp_prot = {5081 sock_wmalloc,
5082 sock_rmalloc,
5083 sock_wfree,
5084 sock_rfree,
5085 sock_rspace,
5086 sock_wspace,
5087 tcp_close,
5088 tcp_read,
5089 tcp_write,
5090 tcp_sendto,
5091 tcp_recvfrom,
5092 ip_build_header,
5093 tcp_connect,
5094 tcp_accept,
5095 ip_queue_xmit,
5096 tcp_retransmit,
5097 tcp_write_wakeup,
5098 tcp_read_wakeup,
5099 tcp_rcv,
5100 tcp_select,
5101 tcp_ioctl,
5102 NULL,
5103 tcp_shutdown,
5104 tcp_setsockopt,
5105 tcp_getsockopt,
5106 128,
5107 0,
5108 {NULL,},
5109 "TCP",
5110 0, 0
5111 };