1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@no.unit.nvg> 20 * 21 * Fixes: 22 * Alan Cox : Numerous verify_area() calls 23 * Alan Cox : Set the ACK bit on a reset 24 * Alan Cox : Stopped it crashing if it closed while sk->inuse=1 25 * and was trying to connect (tcp_err()). 26 * Alan Cox : All icmp error handling was broken 27 * pointers passed where wrong and the 28 * socket was looked up backwards. Nobody 29 * tested any icmp error code obviously. 30 * Alan Cox : tcp_err() now handled properly. It wakes people 31 * on errors. select behaves and the icmp error race 32 * has gone by moving it into sock.c 33 * Alan Cox : tcp_reset() fixed to work for everything not just 34 * packets for unknown sockets. 35 * Alan Cox : tcp option processing. 36 * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong] 37 * Herp Rosmanith : More reset fixes 38 * Alan Cox : No longer acks invalid rst frames. Acking 39 * any kind of RST is right out. 40 * Alan Cox : Sets an ignore me flag on an rst receive 41 * otherwise odd bits of prattle escape still 42 * Alan Cox : Fixed another acking RST frame bug. Should stop 43 * LAN workplace lockups. 
44 * Alan Cox : Some tidyups using the new skb list facilities 45 * Alan Cox : sk->keepopen now seems to work 46 * Alan Cox : Pulls options out correctly on accepts 47 * Alan Cox : Fixed assorted sk->rqueue->next errors 48 * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops. 49 * Alan Cox : Tidied tcp_data to avoid a potential nasty. 50 * Alan Cox : Added some better commenting, as the tcp is hard to follow 51 * Alan Cox : Removed incorrect check for 20 * psh 52 * Michael O'Reilly : ack < copied bug fix. 53 * Johannes Stille : Misc tcp fixes (not all in yet). 54 * Alan Cox : FIN with no memory -> CRASH 55 * Alan Cox : Added socket option proto entries. Also added awareness of them to accept. 56 * Alan Cox : Added TCP options (SOL_TCP) 57 * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets. 58 * Alan Cox : Use ip_tos/ip_ttl settings. 59 * Alan Cox : Handle FIN (more) properly (we hope). 60 * Alan Cox : RST frames sent on unsynchronised state ack error/ 61 * Alan Cox : Put in missing check for SYN bit. 62 * Alan Cox : Added tcp_select_window() aka NET2E 63 * window non shrink trick. 64 * Alan Cox : Added a couple of small NET2E timer fixes 65 * Charles Hedrick : TCP fixes 66 * Toomas Tamm : TCP window fixes 67 * Alan Cox : Small URG fix to rlogin ^C ack fight 68 * Charles Hedrick : Rewrote most of it to actually work 69 * Linus : Rewrote tcp_read() and URG handling 70 * completely 71 * Gerhard Koerting: Fixed some missing timer handling 72 * Matthew Dillon : Reworked TCP machine states as per RFC 73 * Gerhard Koerting: PC/TCP workarounds 74 * Adam Caldwell : Assorted timer/timing errors 75 * Matthew Dillon : Fixed another RST bug 76 * Alan Cox : Move to kernel side addressing changes. 77 * Alan Cox : Beginning work on TCP fastpathing (not yet usable) 78 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
79 * Alan Cox : TCP fast path debugging 80 * Alan Cox : Window clamping 81 * Michael Riepe : Bug in tcp_check() 82 * Matt Dillon : More TCP improvements and RST bug fixes 83 * Matt Dillon : Yet more small nasties remove from the TCP code 84 * (Be very nice to this man if tcp finally works 100%) 8) 85 * Alan Cox : BSD accept semantics. 86 * Alan Cox : Reset on closedown bug. 87 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 88 * Michael Pall : Handle select() after URG properly in all cases. 89 * Michael Pall : Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin). 90 * Michael Pall : Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now. 91 * Michael Pall : recv(...,MSG_OOB) never blocks in the BSD api. 92 * Alan Cox : Changed the semantics of sk->socket to 93 * fix a race and a signal problem with 94 * accept() and async I/O. 95 * Alan Cox : Relaxed the rules on tcp_sendto(). 96 * Yury Shevchuk : Really fixed accept() blocking problem. 97 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 98 * clients/servers which listen in on 99 * fixed ports. 100 * Alan Cox : Cleaned the above up and shrank it to 101 * a sensible code size. 102 * Alan Cox : Self connect lockup fix. 103 * Alan Cox : No connect to multicast. 104 * Ross Biro : Close unaccepted children on master 105 * socket close. 106 * Alan Cox : Reset tracing code. 107 * Alan Cox : Spurious resets on shutdown. 108 * Alan Cox : Giant 15 minute/60 second timer error 109 * Alan Cox : Small whoops in selecting before an accept. 110 * Alan Cox : Kept the state trace facility since its 111 * handy for debugging. 112 * Alan Cox : More reset handler fixes. 113 * Alan Cox : Started rewriting the code based on the RFC's 114 * for other useful protocol references see: 115 * Comer, KA9Q NOS, and for a reference on the 116 * difference between specifications and how BSD 117 * works see the 4.4lite source. 
118 * A.N.Kuznetsov : Don't time wait on completion of tidy 119 * close. 120 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 121 * Linus Torvalds : Fixed BSD port reuse to work first syn 122 * Alan Cox : Reimplemented timers as per the RFC and using multiple 123 * timers for sanity. 124 * Alan Cox : Small bug fixes, and a lot of new 125 * comments. 126 * Alan Cox : Fixed dual reader crash by locking 127 * the buffers (much like datagram.c) 128 * Alan Cox : Fixed stuck sockets in probe. A probe 129 * now gets fed up of retrying without 130 * (even a no space) answer. 131 * Alan Cox : Extracted closing code better 132 * Alan Cox : Fixed the closing state machine to 133 * resemble the RFC. 134 * 135 * 136 * To Fix: 137 * Fast path the code. Two things here - fix the window calculation 138 * so it doesn't iterate over the queue, also spot packets with no funny 139 * options arriving in order and process directly. 140 * 141 * Implement RFC 1191 [Path MTU discovery] 142 * Look at the effect of implementing RFC 1337 suggestions and their impact. 143 * Rewrite output state machine to use a single queue and do low window 144 * situations as per the spec (RFC 1122) 145 * Speed up input assembly algorithm. 146 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 147 * could do with it working on IPv4 148 * User settable/learned rtt/max window/mtu 149 * Cope with MTU/device switches when retransmitting in tcp. 150 * Fix the window handling to use PR's new code. 151 * 152 * Change the fundamental structure to a single send queue maintained 153 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 154 * active routes too]). Cut the queue off in tcp_retransmit/ 155 * tcp_transmit. 156 * Change the receive queue to assemble as it goes. This lets us 157 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 158 * tcp_data/tcp_read as well as the window shrink crud. 
 *		Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 *		tcp_queue_skb seem obvious routines to extract.
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 * Description of States:
 *
 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 *
 *	TCP_SYN_RECV		received a connection request, sent ack,
 *				waiting for final ack in three-way handshake.
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *				to shutdown
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING.  Required because the other end
 *				may not have gotten our last ACK causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown.  There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */

202 #include <linux/types.h>
203 #include <linux/sched.h>
204 #include <linux/mm.h>
205 #include <linux/string.h>
206 #include <linux/config.h>
207 #include <linux/socket.h>
208 #include <linux/sockios.h>
209 #include <linux/termios.h>
210 #include <linux/in.h>
211 #include <linux/fcntl.h>
212 #include <linux/inet.h>
213 #include <linux/netdevice.h>
214 #include "snmp.h"
215 #include "ip.h"
216 #include "protocol.h"
217 #include "icmp.h"
218 #include "tcp.h"
219 #include "arp.h"
220 #include <linux/skbuff.h>
221 #include "sock.h"
222 #include "route.h"
223 #include <linux/errno.h>
224 #include <linux/timer.h>
225 #include <asm/system.h>
226 #include <asm/segment.h>
227 #include <linux/mm.h>
228
229 /* 230 * The MSL timer is the 'normal' timer. 231 */ 232
233 #definereset_msl_timer(x,y,z) reset_timer(x,y,z)
234
235 #defineSEQ_TICK 3
236 unsignedlongseq_offset;
237 structtcp_mibtcp_statistics;
238
239 staticvoidtcp_close(structsock *sk, inttimeout);
240
241
242 /* 243 * The less said about this the better, but it works and will do for 1.2 244 */ 245
246 staticstructwait_queue *master_select_wakeup;
247
248 static__inline__intmin(unsignedinta, unsignedintb)
/* */ 249 { 250 if (a < b)
251 return(a);
252 return(b);
253 } 254
255 #undefSTATE_TRACE 256
257 #ifdefSTATE_TRACE 258 staticchar *statename[]={ 259 "Unused","Established","Syn Sent","Syn Recv",
260 "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
261 "Close Wait","Last ACK","Listen","Closing"
262 };
263 #endif 264
265 static__inline__voidtcp_set_state(structsock *sk, intstate)
/* */ 266 { 267 if(sk->state==TCP_ESTABLISHED)
268 tcp_statistics.TcpCurrEstab--;
269 #ifdefSTATE_TRACE 270 if(sk->debug)
271 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
272 #endif 273 /* This is a hack but it doesn't occur often and its going to 274 be a real to fix nicely */ 275
276 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
277 { 278 wake_up_interruptible(&master_select_wakeup);
279 } 280 sk->state=state;
281 if(state==TCP_ESTABLISHED)
282 tcp_statistics.TcpCurrEstab++;
283 } 284
285 /* 286 * This routine picks a TCP windows for a socket based on 287 * the following constraints 288 * 289 * 1. The window can never be shrunk once it is offered (RFC 793) 290 * 2. We limit memory per socket 291 * 292 * For now we use NET2E3's heuristic of offering half the memory 293 * we have handy. All is not as bad as this seems however because 294 * of two things. Firstly we will bin packets even within the window 295 * in order to get the data we are waiting for into the memory limit. 296 * Secondly we bin common duplicate forms at receive time 297 * Better heuristics welcome 298 */ 299
300 inttcp_select_window(structsock *sk)
/* */ 301 { 302 intnew_window = sk->prot->rspace(sk);
303
304 if(sk->window_clamp)
305 new_window=min(sk->window_clamp,new_window);
306 /* 307 * Two things are going on here. First, we don't ever offer a 308 * window less than min(sk->mss, MAX_WINDOW/2). This is the 309 * receiver side of SWS as specified in RFC1122. 310 * Second, we always give them at least the window they 311 * had before, in order to avoid retracting window. This 312 * is technically allowed, but RFC1122 advises against it and 313 * in practice it causes trouble. 314 * 315 * Fixme: This doesn't correctly handle the case where 316 * new_window > sk->window but not by enough to allow for the 317 * shift in sequence space. 318 */ 319 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
320 return(sk->window);
321 return(new_window);
322 } 323
324 /* 325 * Find someone to 'accept'. Must be called with 326 * sk->inuse=1 or cli() 327 */ 328
329 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 330 { 331 structsk_buff *p=skb_peek(&s->receive_queue);
332 if(p==NULL)
333 returnNULL;
334 do 335 { 336 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
337 returnp;
338 p=p->next;
339 } 340 while(p!=(structsk_buff *)&s->receive_queue);
341 returnNULL;
342 } 343
344 /* 345 * Remove a completed connection and return it. This is used by 346 * tcp_accept() to get connections from the queue. 347 */ 348
349 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 350 { 351 structsk_buff *skb;
352 unsignedlongflags;
353 save_flags(flags);
354 cli();
355 skb=tcp_find_established(s);
356 if(skb!=NULL)
357 skb_unlink(skb); /* Take it off the queue */ 358 restore_flags(flags);
359 returnskb;
360 } 361
362 /* 363 * This routine closes sockets which have been at least partially 364 * opened, but not yet accepted. Currently it is only called by 365 * tcp_close, and timeout mirrors the value there. 366 */ 367
368 staticvoidtcp_close_pending (structsock *sk, inttimeout)
/* */ 369 { 370 structsk_buff *skb;
371
372 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) { 373 tcp_close(skb->sk, timeout);
374 kfree_skb(skb, FREE_READ);
375 } 376 return;
377 } 378
379 /* 380 * Enter the time wait state. 381 */ 382
383 staticvoidtcp_time_wait(structsock *sk)
/* */ 384 { 385 tcp_set_state(sk,TCP_TIME_WAIT);
386 sk->shutdown = SHUTDOWN_MASK;
387 if (!sk->dead)
388 sk->state_change(sk);
389 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
390 } 391
392 /* 393 * A socket has timed out on its send queue and wants to do a 394 * little retransmitting. Currently this means TCP. 395 */ 396
397 voidtcp_do_retransmit(structsock *sk, intall)
/* */ 398 { 399 structsk_buff * skb;
400 structproto *prot;
401 structdevice *dev;
402
403 prot = sk->prot;
404 skb = sk->send_head;
405
406 while (skb != NULL)
407 { 408 structtcphdr *th;
409 structiphdr *iph;
410 intsize;
411
412 dev = skb->dev;
413 IS_SKB(skb);
414 skb->when = jiffies;
415
416 /* 417 * In general it's OK just to use the old packet. However we 418 * need to use the current ack and window fields. Urg and 419 * urg_ptr could possibly stand to be updated as well, but we 420 * don't keep the necessary data. That shouldn't be a problem, 421 * if the other end is doing the right thing. Since we're 422 * changing the packet, we have to issue a new IP identifier. 423 */ 424
425 iph = (structiphdr *)(skb->data + dev->hard_header_len);
426 th = (structtcphdr *)(((char *)iph) + (iph->ihl << 2));
427 size = skb->len - (((unsignedchar *) th) - skb->data);
428
429 /* 430 * Note: We ought to check for window limits here but 431 * currently this is done (less efficiently) elsewhere. 432 * We do need to check for a route change but can't handle 433 * that until we have the new 1.3.x buffers in. 434 * 435 */ 436
437 iph->id = htons(ip_id_count++);
438 ip_send_check(iph);
439
440 /* 441 * This is not the right way to handle this. We have to 442 * issue an up to date window and ack report with this 443 * retransmit to keep the odd buggy tcp that relies on 444 * the fact BSD does this happy. 445 * We don't however need to recalculate the entire 446 * checksum, so someone wanting a small problem to play 447 * with might like to implement RFC1141/RFC1624 and speed 448 * this up by avoiding a full checksum. 449 */ 450
451 th->ack_seq = ntohl(sk->acked_seq);
452 th->window = ntohs(tcp_select_window(sk));
453 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
454
455 /* 456 * If the interface is (still) up and running, kick it. 457 */ 458
459 if (dev->flags & IFF_UP)
460 { 461 /* 462 * If the packet is still being sent by the device/protocol 463 * below then don't retransmit. This is both needed, and good - 464 * especially with connected mode AX.25 where it stops resends 465 * occurring of an as yet unsent anyway frame! 466 * We still add up the counts as the round trip time wants 467 * adjusting. 468 */ 469 if (sk && !skb_device_locked(skb))
470 { 471 /* Remove it from any existing driver queue first! */ 472 skb_unlink(skb);
473 /* Now queue it */ 474 ip_statistics.IpOutRequests++;
475 dev_queue_xmit(skb, dev, sk->priority);
476 } 477 } 478
479 /* 480 * Count retransmissions 481 */ 482
483 sk->retransmits++;
484 sk->prot->retransmits ++;
485
486 /* 487 * Only one retransmit requested. 488 */ 489
490 if (!all)
491 break;
492
493 /* 494 * This should cut it off before we send too many packets. 495 */ 496
497 if (sk->retransmits >= sk->cong_window)
498 break;
499 skb = skb->link3;
500 } 501 } 502
503 /* 504 * Reset the retransmission timer 505 */ 506
507 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 508 { 509 del_timer(&sk->retransmit_timer);
510 sk->ip_xmit_timeout = why;
511 if((int)when < 0)
512 { 513 when=3;
514 printk("Error: Negative timer in xmit_timer\n");
515 } 516 sk->retransmit_timer.expires=when;
517 add_timer(&sk->retransmit_timer);
518 } 519
520 /* 521 * This is the normal code called for timeouts. It does the retransmission 522 * and then does backoff. tcp_do_retransmit is separated out because 523 * tcp_ack needs to send stuff from the retransmit queue without 524 * initiating a backoff. 525 */ 526
527
528 voidtcp_retransmit_time(structsock *sk, intall)
/* */ 529 { 530 tcp_do_retransmit(sk, all);
531
532 /* 533 * Increase the timeout each time we retransmit. Note that 534 * we do not increase the rtt estimate. rto is initialized 535 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 536 * that doubling rto each time is the least we can get away with. 537 * In KA9Q, Karn uses this for the first few times, and then 538 * goes to quadratic. netBSD doubles, but only goes up to *64, 539 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 540 * defined in the protocol as the maximum possible RTT. I guess 541 * we'll have to use something other than TCP to talk to the 542 * University of Mars. 543 * 544 * PAWS allows us longer timeouts and large windows, so once 545 * implemented ftp to mars will work nicely. We will have to fix 546 * the 120 second clamps though! 547 */ 548
549 sk->retransmits++;
550 sk->backoff++;
551 sk->rto = min(sk->rto << 1, 120*HZ);
552 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
553 } 554
555
/*
 *	A timer event has triggered a tcp retransmit timeout. The
 *	socket xmit queue is ready and set up to send. Because
 *	the ack receive code keeps the queue straight we do
 *	nothing clever here.
 */

563 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 564 { 565 if (all)
566 { 567 tcp_retransmit_time(sk, all);
568 return;
569 } 570
571 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 572 /* sk->ssthresh in theory can be zero. I guess that's OK */ 573 sk->cong_count = 0;
574
575 sk->cong_window = 1;
576
577 /* Do the actual retransmit. */ 578 tcp_retransmit_time(sk, all);
579 } 580
/*
 *	A write timeout has occurred. Process the after effects.
 */

585 staticinttcp_write_timeout(structsock *sk)
/* */ 586 { 587 /* 588 * Look for a 'soft' timeout. 589 */ 590 if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
591 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
592 { 593 /* 594 * Attempt to recover if arp has changed (unlikely!) or 595 * a route has shifted (not supported prior to 1.3). 596 */ 597 arp_destroy (sk->daddr, 0);
598 ip_route_check (sk->daddr);
599 } 600 /* 601 * Has it gone just too far ? 602 */ 603 if (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR2)
604 { 605 sk->err = ETIMEDOUT;
606 /* 607 * Time wait the socket 608 */ 609 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING)
610 { 611 tcp_set_state(sk,TCP_TIME_WAIT);
612 reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
613 } 614 else 615 { 616 /* 617 * Clean up time. 618 */ 619 sk->prot->close (sk, 1);
620 return 0;
621 } 622 } 623 return 1;
624 } 625
626 /* 627 * The TCP retransmit timer. This lacks a few small details. 628 * 629 * 1. An initial rtt timeout on the probe0 should cause what we can 630 * of the first write queue buffer to be split and sent. 631 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report 632 * ETIMEDOUT if we know an additional 'soft' error caused this. 633 * tcp_err should save a 'soft error' for us. 634 */ 635
636 staticvoidretransmit_timer(unsignedlongdata)
/* */ 637 { 638 structsock *sk = (structsock*)data;
639 intwhy = sk->ip_xmit_timeout;
640
641 /* 642 * only process if socket is not in use 643 */ 644
645 cli();
646 if (sk->inuse || in_bh)
647 { 648 /* Try again in 1 second */ 649 sk->retransmit_timer.expires = HZ;
650 add_timer(&sk->retransmit_timer);
651 sti();
652 return;
653 } 654
655 sk->inuse = 1;
656 sti();
657
658 /* Always see if we need to send an ack. */ 659
660 if (sk->ack_backlog && !sk->zapped)
661 { 662 sk->prot->read_wakeup (sk);
663 if (! sk->dead)
664 sk->data_ready(sk,0);
665 } 666
667 /* Now we need to figure out why the socket was on the timer. */ 668
669 switch (why)
670 { 671 /* Window probing */ 672 caseTIME_PROBE0:
673 tcp_send_probe0(sk);
674 if(tcp_write_timeout(sk))
675 release_sock (sk);
676 break;
677 /* Retransmitting */ 678 caseTIME_WRITE:
679 /* It could be we got here because we needed to send an ack. 680 * So we need to check for that. 681 */ 682 { 683 structsk_buff *skb;
684 unsignedlongflags;
685
686 save_flags(flags);
687 cli();
688 skb = sk->send_head;
689 if (!skb)
690 { 691 restore_flags(flags);
692 } 693 else 694 { 695 /* 696 * Kicked by a delayed ack. Reset timer 697 * correctly now 698 */ 699 if (jiffies < skb->when + sk->rto)
700 { 701 reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
702 restore_flags(flags);
703 release_sock (sk);
704 break;
705 } 706 restore_flags(flags);
707 /* 708 * Retransmission 709 */ 710 sk->prot->retransmit (sk, 0);
711 if(!tcp_write_timeout(sk))
712 break;
713 } 714 release_sock (sk);
715 break;
716 } 717 /* Sending Keepalives */ 718 caseTIME_KEEPOPEN:
719 /* 720 * this reset_timer() call is a hack, this is not 721 * how KEEPOPEN is supposed to work. 722 */ 723 reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
724
725 /* Send something to keep the connection open. */ 726 if (sk->prot->write_wakeup)
727 sk->prot->write_wakeup (sk);
728 sk->retransmits++;
729 if(tcp_write_timeout(sk))
730 release_sock (sk);
731 break;
732 default:
733 printk ("rexmit_timer: timer expired - reason unknown\n");
734 release_sock (sk);
735 break;
736 } 737 } 738
739 /* 740 * This routine is called by the ICMP module when it gets some 741 * sort of error condition. If err < 0 then the socket should 742 * be closed and the error returned to the user. If err > 0 743 * it's just the icmp type << 8 | icmp code. After adjustment 744 * header points to the first 8 bytes of the tcp header. We need 745 * to find the appropriate port. 746 */ 747
748 voidtcp_err(interr, unsignedchar *header, unsignedlongdaddr,
/* */ 749 unsignedlongsaddr, structinet_protocol *protocol)
750 { 751 structtcphdr *th;
752 structsock *sk;
753 structiphdr *iph=(structiphdr *)header;
754
755 header+=4*iph->ihl;
756
757
758 th =(structtcphdr *)header;
759 sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
760
761 if (sk == NULL)
762 return;
763
764 if(err<0)
765 { 766 sk->err = -err;
767 sk->error_report(sk);
768 return;
769 } 770
771 if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8))
772 { 773 /* 774 * FIXME: 775 * For now we will just trigger a linear backoff. 776 * The slow start code should cause a real backoff here. 777 */ 778 if (sk->cong_window > 4)
779 sk->cong_window--;
780 return;
781 } 782
783 /* sk->err = icmp_err_convert[err & 0xff].errno; -- moved as TCP should hide non fatals internally (and does) */ 784
785 /* 786 * If we've already connected we will keep trying 787 * until we time out, or the user gives up. 788 */ 789
790 if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT)
791 { 792 if (sk->state == TCP_SYN_SENT)
793 { 794 tcp_statistics.TcpAttemptFails++;
795 tcp_set_state(sk,TCP_CLOSE);
796 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ 797 } 798 sk->err = icmp_err_convert[err & 0xff].errno;
799 } 800 return;
801 } 802
803
804 /* 805 * Walk down the receive queue counting readable data until we hit the end or we find a gap 806 * in the received data queue (ie a frame missing that needs sending to us). Not 807 * sorting using two queues as data arrives makes life so much harder. 808 */ 809
810 staticinttcp_readable(structsock *sk)
/* */ 811 { 812 unsignedlongcounted;
813 unsignedlongamount;
814 structsk_buff *skb;
815 intsum;
816 unsignedlongflags;
817
818 if(sk && sk->debug)
819 printk("tcp_readable: %p - ",sk);
820
821 save_flags(flags);
822 cli();
823 if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
824 { 825 restore_flags(flags);
826 if(sk && sk->debug)
827 printk("empty\n");
828 return(0);
829 } 830
831 counted = sk->copied_seq; /* Where we are at the moment */ 832 amount = 0;
833
834 /* 835 * Do until a push or until we are out of data. 836 */ 837
838 do 839 { 840 if (before(counted, skb->h.th->seq)) /* Found a hole so stops here */ 841 break;
842 sum = skb->len -(counted - skb->h.th->seq); /* Length - header but start from where we are up to (avoid overlaps) */ 843 if (skb->h.th->syn)
844 sum++;
845 if (sum > 0)
846 {/* Add it up, move on */ 847 amount += sum;
848 if (skb->h.th->syn)
849 amount--;
850 counted += sum;
851 } 852 /* 853 * Don't count urg data ... but do it in the right place! 854 * Consider: "old_data (ptr is here) URG PUSH data" 855 * The old code would stop at the first push because 856 * it counted the urg (amount==1) and then does amount-- 857 * *after* the loop. This means tcp_readable() always 858 * returned zero if any URG PUSH was in the queue, even 859 * though there was normal data available. If we subtract 860 * the urg data right here, we even get it to work for more 861 * than one URG PUSH skb without normal data. 862 * This means that select() finally works now with urg data 863 * in the queue. Note that rlogin was never affected 864 * because it doesn't use select(); it uses two processes 865 * and a blocking read(). And the queue scan in tcp_read() 866 * was correct. Mike <pall@rz.uni-karlsruhe.de> 867 */ 868 if (skb->h.th->urg)
869 amount--; /* don't count urg data */ 870 if (amount && skb->h.th->psh) break;
871 skb = skb->next;
872 } 873 while(skb != (structsk_buff *)&sk->receive_queue);
874
875 restore_flags(flags);
876 if(sk->debug)
877 printk("got %lu bytes.\n",amount);
878 return(amount);
879 } 880
881 /* 882 * LISTEN is a special case for select.. 883 */ 884 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */ 885 { 886 if (sel_type == SEL_IN) { 887 intretval;
888
889 sk->inuse = 1;
890 retval = (tcp_find_established(sk) != NULL);
891 release_sock(sk);
892 if (!retval)
893 select_wait(&master_select_wakeup,wait);
894 returnretval;
895 } 896 return 0;
897 } 898
899
900 /* 901 * Wait for a TCP event. 902 * 903 * Note that we don't need to set "sk->inuse", as the upper select layers 904 * take care of normal races (between the test and the event) and we don't 905 * go look at any of the socket buffers directly. 906 */ 907 staticinttcp_select(structsock *sk, intsel_type, select_table *wait)
/* */ 908 { 909 if (sk->state == TCP_LISTEN)
910 returntcp_listen_select(sk, sel_type, wait);
911
912 switch(sel_type) { 913 caseSEL_IN:
914 if (sk->err)
915 return 1;
916 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
917 break;
918
919 if (sk->shutdown & RCV_SHUTDOWN)
920 return 1;
921
922 if (sk->acked_seq == sk->copied_seq)
923 break;
924
925 if (sk->urg_seq != sk->copied_seq ||
926 sk->acked_seq != sk->copied_seq+1 ||
927 sk->urginline || !sk->urg_data)
928 return 1;
929 break;
930
931 caseSEL_OUT:
932 if (sk->shutdown & SEND_SHUTDOWN)
933 return 0;
934 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
935 break;
936 /* 937 * This is now right thanks to a small fix 938 * by Matt Dillon. 939 */ 940
941 if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
942 break;
943 return 1;
944
945 caseSEL_EX:
946 if (sk->err || sk->urg_data)
947 return 1;
948 break;
949 } 950 select_wait(sk->sleep, wait);
951 return 0;
952 } 953
954 inttcp_ioctl(structsock *sk, intcmd, unsignedlongarg)
/* */ 955 { 956 interr;
957 switch(cmd)
958 { 959
960 caseTIOCINQ:
961 #ifdef FIXME /* FIXME: */ 962 caseFIONREAD:
963 #endif 964 { 965 unsignedlongamount;
966
967 if (sk->state == TCP_LISTEN)
968 return(-EINVAL);
969
970 sk->inuse = 1;
971 amount = tcp_readable(sk);
972 release_sock(sk);
973 err=verify_area(VERIFY_WRITE,(void *)arg,
974 sizeof(unsignedlong));
975 if(err)
976 returnerr;
977 put_fs_long(amount,(unsignedlong *)arg);
978 return(0);
979 } 980 caseSIOCATMARK:
981 { 982 intansw = sk->urg_data && sk->urg_seq == sk->copied_seq;
983
984 err = verify_area(VERIFY_WRITE,(void *) arg,
985 sizeof(unsignedlong));
986 if (err)
987 returnerr;
988 put_fs_long(answ,(int *) arg);
989 return(0);
990 } 991 caseTIOCOUTQ:
992 { 993 unsignedlongamount;
994
995 if (sk->state == TCP_LISTEN) return(-EINVAL);
996 amount = sk->prot->wspace(sk);
997 err=verify_area(VERIFY_WRITE,(void *)arg,
998 sizeof(unsignedlong));
999 if(err)
1000 returnerr;
1001 put_fs_long(amount,(unsignedlong *)arg);
1002 return(0);
1003 }1004 default:
1005 return(-EINVAL);
1006 }1007 }1008
1009
/*
 *	This routine computes a TCP checksum.
 *
 *	Sums the TCP pseudo-header (source address, destination address,
 *	protocol and segment length) and then the segment itself, and
 *	returns the ones-complement of the folded 16-bit sum, ready to be
 *	stored into th->check.  Both halves are hand written i386 inline
 *	assembly; this function is not portable.
 */
unsigned short tcp_check(struct tcphdr *th, int len,
	  unsigned long saddr, unsigned long daddr)
{
	unsigned long sum;

	/* A zero source address means "use our own address". */
	if (saddr == 0) saddr = ip_my_addr();

	/*
	 * stupid, gcc complains when I use just one __asm__ block,
	 * something about too many reloads, but this is just two
	 * instructions longer than what I want
	 */
	/*
	 * Pseudo-header sum: ebx = daddr + saddr + (swapped len | proto),
	 * with the end-around carry folded back in by the final adcl $0.
	 */
	__asm__("
	    addl %%ecx, %%ebx
	    adcl %%edx, %%ebx
	    adcl $0, %%ebx
	    "
	: "=b"(sum)
	: "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
	: "bx", "cx", "dx" );
	/*
	 * Segment sum.  The bulk is done 32 bytes per iteration
	 * (8 unrolled lodsl/adcl pairs, carried through the loop),
	 * then the remaining whole dwords, then a trailing word and a
	 * trailing byte, and finally the 32-bit accumulator is folded
	 * into 16 bits (high half added to low half, plus carry).
	 */
	__asm__("
	    movl %%ecx, %%edx
	    cld
	    cmpl $32, %%ecx
	    jb 2f
	    shrl $5, %%ecx
	    clc
	    1: lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    loop 1b
	    adcl $0, %%ebx
	    movl %%edx, %%ecx
	    2: andl $28, %%ecx
	    je 4f
	    shrl $2, %%ecx
	    clc
	    3: lodsl
	    adcl %%eax, %%ebx
	    loop 3b
	    adcl $0, %%ebx
	    4: movl $0, %%eax
	    testw $2, %%dx
	    je 5f
	    lodsw
	    addl %%eax, %%ebx
	    adcl $0, %%ebx
	    movw $0, %%ax
	    5: test $1, %%edx
	    je 6f
	    lodsb
	    addl %%eax, %%ebx
	    adcl $0, %%ebx
	    6: movl %%ebx, %%eax
	    shrl $16, %%eax
	    addw %%ax, %%bx
	    adcw $0, %%bx
	    "
	: "=b"(sum)
	: "0"(sum), "c"(len), "S"(th)
	: "ax", "bx", "cx", "dx", "si" );

	/* We only want the bottom 16 bits, but we never cleared the top 16. */

	return((~sum) & 0xffff);
}
1094
1095
1096 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1097 unsignedlongdaddr, intlen, structsock *sk)
1098 {1099 th->check = 0;
1100 th->check = tcp_check(th, len, saddr, daddr);
1101 return;
1102 }1103
/*
 *	This is the main buffer sending routine. We queue the buffer
 *	having checked it is sane seeming.
 *
 *	Either transmits skb immediately or appends it to sk->write_queue
 *	for later transmission, depending on window/Nagle/congestion
 *	state.  Takes ownership of skb; a malformed skb is freed here.
 */
static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 *	length of packet (not counting length of pre-tcp headers)
	 */
	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 *	Sanity check it..
	 */
	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 *	If we have queued a header size packet.. (these crash a few
	 *	tcp stacks if ack is not set)
	 */
	if (size == sizeof(struct tcphdr))
	{
		/* If its got a syn or fin its notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 *	Actual processing.
	 */
	tcp_statistics.TcpOutSegs++;
	/* h.seq = sequence number of the LAST byte (header bytes excluded). */
	skb->h.seq = ntohl(th->seq) + size - 4*th->doff;

	/*
	 *	We must queue if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */
	if (after(skb->h.seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit. So
		 * we shouldn't need to set it at all. I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 *	If we don't fit we have to start the zero window
		 *	probes. This is broken - we really need to do a partial
		 *	send _first_ (This is what causes the Cisco and PC/TCP
		 *	grief).
		 */
		if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 *	This is going straight out
		 */
		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 *	This is mad. The tcp retransmit queue is put together
		 *	by the ip layer. This causes half the problems with
		 *	unroutable FIN's and other things.
		 */
		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 *	Set for next retransmit based on expected ACK time.
		 *	FIXME: We set this every time which means our
		 *	retransmits are really about a window behind.
		 */
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
1219 /*1220 * Locking problems lead us to a messy situation where we can have1221 * multiple partially complete buffers queued up. This is really bad1222 * as we don't want to be sending partial buffers. Fix this with1223 * a semaphore or similar to lock tcp_write per socket.1224 *1225 * These routines are pretty self descriptive.1226 */1227
1228 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1229 {1230 structsk_buff * skb;
1231 unsignedlongflags;
1232
1233 save_flags(flags);
1234 cli();
1235 skb = sk->partial;
1236 if (skb) {1237 sk->partial = NULL;
1238 del_timer(&sk->partial_timer);
1239 }1240 restore_flags(flags);
1241 returnskb;
1242 }1243
1244 /*1245 * Empty the partial queue1246 */1247
1248 staticvoidtcp_send_partial(structsock *sk)
/* */1249 {1250 structsk_buff *skb;
1251
1252 if (sk == NULL)
1253 return;
1254 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1255 tcp_send_skb(sk, skb);
1256 }1257
1258 /*1259 * Queue a partial frame1260 */1261
1262 voidtcp_enqueue_partial(structsk_buff * skb, structsock * sk)
/* */1263 {1264 structsk_buff * tmp;
1265 unsignedlongflags;
1266
1267 save_flags(flags);
1268 cli();
1269 tmp = sk->partial;
1270 if (tmp)
1271 del_timer(&sk->partial_timer);
1272 sk->partial = skb;
1273 init_timer(&sk->partial_timer);
1274 /*1275 * Wait up to 1 second for the buffer to fill.1276 */1277 sk->partial_timer.expires = HZ;
1278 sk->partial_timer.function = (void (*)(unsignedlong)) tcp_send_partial;
1279 sk->partial_timer.data = (unsignedlong) sk;
1280 add_timer(&sk->partial_timer);
1281 restore_flags(flags);
1282 if (tmp)
1283 tcp_send_skb(sk, tmp);
1284 }1285
/*
 *	This routine sends an ack and also updates the window.
 *
 *	Builds a bare ACK segment (no flags other than ACK) acknowledging
 *	'ack' with sequence 'sequence', using 'th' as the template whose
 *	ports are swapped.  On allocation failure the ack is merely
 *	re-scheduled, since ACKs are unreliable anyway.
 */
static void tcp_send_ack(unsigned long sequence, unsigned long ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;		/* We have been reset, we may not send again */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */
	buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 *	Force it to send an ack. We don't have to do this
		 *	(ACK is unreliable) but its much better use of
		 *	bandwidth on slow links to send a spare ack than
		 *	resend packets.
		 */
		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 *	Assemble a suitable TCP frame
	 */
	buff->len = sizeof(struct tcphdr);
	buff->sk = sk;
	buff->localroute = sk->localroute;
	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route: drop the frame on the floor. */
		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}
	buff->len += tmp;
	t1 =(struct tcphdr *)((char *)t1 +tmp);

	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */
	t1->dest = th->source;
	t1->source = th->dest;
	/* ntohl is its own inverse, so this is the network-order sequence. */
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;

	/*
	 *	If we have nothing queued for transmit and the transmit timer
	 *	is on we are just doing an ACK timeout and need to switch
	 *	to a keepalive.
	 */
	if (ack == sk->acked_seq)
	{
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
				  && sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 *	Fill in the packet and send it
	 */
	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		 printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}
1405
1406 /* 1407 * This routine builds a generic TCP header. 1408 */1409
1410 extern__inlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */1411 {1412
1413 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1414 th->seq = htonl(sk->write_seq);
1415 th->psh =(push == 0) ? 1 : 0;
1416 th->doff = sizeof(*th)/4;
1417 th->ack = 1;
1418 th->fin = 0;
1419 sk->ack_backlog = 0;
1420 sk->bytes_rcv = 0;
1421 sk->ack_timed = 0;
1422 th->ack_seq = htonl(sk->acked_seq);
1423 sk->window = tcp_select_window(sk);
1424 th->window = htons(sk->window);
1425
1426 return(sizeof(*th));
1427 }1428
/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	Returns the number of bytes queued/sent, or a negative errno.
 *	May sleep (connection establishment, memory) unless nonblock is
 *	set.  Once any data has been copied, errors are suppressed and
 *	the partial count is returned instead.
 */
static int tcp_write(struct sock *sk, unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;
	int copy;
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	unsigned char *buff;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1;
	prot = sk->prot;
	while(len > 0)
	{
		if (sk->err)
		{			/* Stop on an error */
			release_sock(sk);
			if (copied)
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 *	First thing we do is make sure that we are established.
		 */
		if (sk->shutdown & SEND_SHUTDOWN)
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied)
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/*
		 *	Wait for a connection to finish.
		 */
		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
		{
			if (sk->err)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			/* Not connecting either: the connection is gone. */
			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
			{
				release_sock(sk);
				if (copied)
					return(copied);

				if (sk->err)
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen)
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 * Sleep until the state changes; cli() closes the
			 * window between the re-check and the sleep.
			 */
			release_sock(sk);
			cli();

			if (sk->state != TCP_ESTABLISHED &&
			    sk->state != TCP_CLOSE_WAIT && sk->err == 0)
			{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

		/*
		 *	The following code can result in copy <= if sk->mss is ever
		 *	decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
		 *	sk->mtu is constant once SYN processing is finished. I.e. we
		 *	had better not get here until we've seen his SYN and at least one
		 *	valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
		 *	But ESTABLISHED should guarantee that. sk->max_window is by definition
		 *	non-decreasing. Note that any ioctl to set user_mss must be done
		 *	before the exchange of SYN's. If the initial ack from the other
		 *	end has a window of 0, max_window and thus mss will both be 0.
		 */

		/*
		 *	Now we need to check if we have a half built packet.
		 */
		if ((skb = tcp_dequeue_partial(sk)) != NULL)
		{
			int hdrlen;

			/* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				 + sizeof(struct tcphdr);

			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB))
			{
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0)
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}

				memcpy_fromfs(skb->data + skb->len, from, copy);
				skb->len += copy;
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy;
			}
			/* Send now if full, urgent, or nothing is in flight. */
			if ((skb->len - hdrlen) >= sk->mss ||
				(flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);
			continue;
		}

		/*
		 *	We also need to worry about the window.
		 *	If window < 1/2 the maximum window we've seen from this
		 *	host, don't use it.  This is sender side
		 *	silly window prevention, as specified in RFC1122.
		 *	(Note that this is different than earlier versions of
		 *	SWS prevention, e.g. RFC813.).  What we actually do is
		 *	use the whole MSS.  Since the results in the right
		 *	edge of the packet being outside the window, it will
		 *	be queued for later rather than sent.
		 */
		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		if (copy > len)
			copy = len;

		/*
		 *	We should really check the window here also.
		 */
		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB))
		{
			/*
			 *	We will release the socket incase we sleep here.
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb;
		}
		else
		{
			/*
			 *	We will release the socket incase we sleep here.
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep.
		 */
		if (skb == NULL)
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition.
			 */
			tmp = sk->wmem_alloc;
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it.
			 */
			if (tmp <= sk->wmem_alloc &&
				  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0)
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->len = 0;
		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

		buff = skb->data;

		/*
		 *	FIXME: we need to optimize this.
		 *	Perhaps some hints here would be good.
		 */
		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 )
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}
		skb->len += tmp;
		skb->dev = dev;
		buff += tmp;
		skb->h.th =(struct tcphdr *) buff;
		tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
		if (tmp < 0)
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB)
		{
			((struct tcphdr *)buff)->urg = 1;
			((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
		}
		skb->len += tmp;
		memcpy_fromfs(buff+tmp, from, copy);

		from += copy;
		copied += copy;
		len -= copy;
		skb->len += copy;
		skb->free = 0;
		sk->write_seq += copy;

		/* Sub-mss frame with data in flight: hold it back (Nagle). */
		if (send_tmp != NULL && sk->packets_out)
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

	/*
	 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
	 *	interactive fast network servers. It's meant to be on and
	 *	it really improves the throughput though not the echo time
	 *	on my slow slip link - Alan
	 */

	/*
	 *	Avoid possible race on send_tmp - c/o Johannes Stille
	 */
	if(sk->partial && ((!sk->packets_out)
	/* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
1759 /*1760 * This is just a wrapper. 1761 */1762
1763 staticinttcp_sendto(structsock *sk, unsignedchar *from,
/* */1764 intlen, intnonblock, unsignedflags,
1765 structsockaddr_in *addr, intaddr_len)
1766 {1767 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1768 return -EINVAL;
1769 if (sk->state == TCP_CLOSE)
1770 return -ENOTCONN;
1771 if (addr_len < sizeof(*addr))
1772 return -EINVAL;
1773 if (addr->sin_family && addr->sin_family != AF_INET)
1774 return -EINVAL;
1775 if (addr->sin_port != sk->dummy_th.dest)
1776 return -EISCONN;
1777 if (addr->sin_addr.s_addr != sk->daddr)
1778 return -EISCONN;
1779 returntcp_write(sk, from, len, nonblock, flags);
1780 }1781
/*
 *	Send an ack if one is backlogged at this point. Ought to merge
 *	this with tcp_send_ack().
 *
 *	Unlike tcp_send_ack() this builds the ack from the socket's
 *	template header (dummy_th) rather than from a received segment.
 *	Does nothing unless acks are actually owed.
 */
static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	if (!sk->ack_backlog)
		return;

	/*
	 * FIXME: we need to put code here to prevent this routine from
	 * being called. Being called once in a while is ok, so only check
	 * if this is the second time in a row.
	 */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */
	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->len = sizeof(struct tcphdr);
	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route: give the buffer back. */
		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}

	buff->len += tmp;
	t1 =(struct tcphdr *)(buff->data +tmp);

	/* Clone the template header, then fill in this ack's specifics. */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
/*
 *	FIXME:
 *	This routine frees used buffers.
 *	It should consider sending an ACK to let the
 *	other end know we now have a bigger window.
 *
 *	Called after the reader has consumed data: releases fully-read
 *	skbs from the receive queue and, if receive space changed,
 *	either acks immediately or schedules a delayed ack.
 */
static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	left = sk->prot->rspace(sk);

	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can.
	 */
	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		/* Stop at the first skb still unread or still in use. */
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */
	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
					    left);
	if ((rspace=sk->prot->rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
		/*
		 * It's unclear whether to use sk->mtu or sk->mss here. They differ only
		 * if the other end is offering a window smaller than the agreed on MSS
		 * (called sk->mtu here). In theory there's no connection between send
		 * and receive, and so no reason to think that they're going to send
		 * small packets. For the moment I'm using the hack of reducing the mss
		 * only on the send side, so I'm putting mtu here.
		 */
		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);
		}
	}
}
1947
1948 /*1949 * Handle reading urgent data. BSD has very simple semantics for1950 * this, no blocking and very strange errors 8)1951 */1952
1953 staticinttcp_read_urg(structsock * sk, intnonblock,
/* */1954 unsignedchar *to, intlen, unsignedflags)
1955 {1956 /*1957 * No URG data to read1958 */1959 if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
1960 return -EINVAL; /* Yes this is right ! */1961
1962 if (sk->err)
1963 {1964 inttmp = -sk->err;
1965 sk->err = 0;
1966 returntmp;
1967 }1968
1969 if (sk->state == TCP_CLOSE || sk->done)
1970 {1971 if (!sk->done) {1972 sk->done = 1;
1973 return 0;
1974 }1975 return -ENOTCONN;
1976 }1977
1978 if (sk->shutdown & RCV_SHUTDOWN)
1979 {1980 sk->done = 1;
1981 return 0;
1982 }1983 sk->inuse = 1;
1984 if (sk->urg_data & URG_VALID)
1985 {1986 charc = sk->urg_data;
1987 if (!(flags & MSG_PEEK))
1988 sk->urg_data = URG_READ;
1989 put_fs_byte(c, to);
1990 release_sock(sk);
1991 return 1;
1992 }1993 release_sock(sk);
1994
1995 /*1996 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and1997 * the available implementations agree in this case:1998 * this call should never block, independent of the1999 * blocking state of the socket.2000 * Mike <pall@rz.uni-karlsruhe.de>2001 */2002 return -EAGAIN;
2003 }2004
/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Returns the number of bytes copied, or a negative errno.  Sleeps
 *	for data unless nonblock is set.  MSG_PEEK reads without
 *	consuming; MSG_OOB is diverted to tcp_read_urg().
 */
static int tcp_read(struct sock *sk, unsigned char *to,
	int len, int nonblock, unsigned flags)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	unsigned long peek_seq;
	volatile unsigned long *seq;	/* So gcc doesnt overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */
	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */
	if (flags & MSG_OOB)
		return tcp_read_urg(sk, nonblock, to, len, flags);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 */
	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;	/* peeks advance a private copy only */

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		unsigned long offset;

		/*
		 *	Are we at urgent data? Stop if we have read anything.
		 */
		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.
		 */
		current->state = TASK_INTERRUPTIBLE;

		/* Walk the receive queue for the skb containing *seq. */
		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies one sequence number */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* fully consumed: cleanup_rbuf may free */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		if (copied)
			break;

		if (sk->err)
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;	/* first read after close: report EOF */
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* No data yet: ack what we took, then sleep for more. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */
		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */
		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?
		 */
		if (sk->urg_data)
		{
			unsigned long urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					/* Skip the urgent byte unless it is inline. */
					if (!sk->urginline)
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;	/* stop short of the urgent byte */
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */
		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */
		memcpy_tofs(to,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		to += used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */
		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;
		if (used + offset < skb->len)
			continue;

		/*
		 *	Process the FIN.
		 */
		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		continue;

	found_fin_ok:
		++*seq;		/* FIN occupies one sequence number */
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */
		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	closed.
 *
 *	Returns 1 when the caller must transmit a FIN, 0 otherwise.
 */
static int tcp_close_state(struct sock *sk)
{
	int ns=TCP_CLOSE;	/* default next state for unlisted states */
	int send_fin=0;
	switch(sk->state)
	{
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:	/* Closedown begin */
			ns=TCP_FIN_WAIT1;
			send_fin=1;
			break;
		case TCP_FIN_WAIT1:	/* Already closing, or FIN sent: no change */
		case TCP_FIN_WAIT2:
		case TCP_CLOSING:
			ns=sk->state;
			break;
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;	/* last case: no break needed */
	}

	tcp_set_state(sk,ns);

	/*
	 *	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 */
	if(sk->dead && ns==TCP_FIN_WAIT2)
	{
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			add_timer(&sk->timer);	/* a timer is already pending: keep it */
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
	}

	return send_fin;
}
/*
 *	Send a fin.
 *
 *	Builds a FIN+ACK segment at sk->write_seq (consuming one sequence
 *	number).  If data is still queued for transmit the FIN is
 *	appended to the write queue instead of being sent immediately.
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */
	buff->sk = sk;
	buff->len = sizeof(*t1);
	buff->localroute = sk->localroute;
	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/*
		 *	Finish anyway, treat this as a send that got lost.
		 *	(Not good).
		 */
		buff->free = 1;
		prot->wfree(sk,buff->mem_addr, buff->mem_len);
		/* Still burn the sequence number the FIN would have used. */
		sk->write_seq++;
		return;
	}

	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */
	t1 =(struct tcphdr *)((char *)t1 +tmp);
	buff->len += tmp;
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	t1->seq = ntohl(sk->write_seq);
	sk->write_seq++;	/* the FIN consumes one sequence number */
	buff->h.seq = sk->write_seq;
	t1->ack = 1;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 * If there is data in the write queue, the fin must be appended to
	 * the write queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		buff->free = 0;
		if (buff->next != NULL)
		{
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	}
	else
	{
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2391 /*2392 * Shutdown the sending side of a connection. Much like close except2393 * that we don't receive shut down or set sk->dead=1.2394 */2395
2396 voidtcp_shutdown(structsock *sk, inthow)
/* */2397 {2398 /*2399 * We need to grab some memory, and put together a FIN,2400 * and then put it into the queue to be sent.2401 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2402 */2403
2404 if (!(how & SEND_SHUTDOWN))
2405 return;
2406
2407 /*2408 * If we've already sent a FIN, or its a closed state2409 */2410
2411 if (sk->state == TCP_FIN_WAIT1 ||
2412 sk->state == TCP_FIN_WAIT2 ||
2413 sk->state == TCP_CLOSING ||
2414 sk->state == TCP_LAST_ACK ||
2415 sk->state == TCP_TIME_WAIT ||
2416 sk->state == TCP_CLOSE ||
2417 sk->state == TCP_LISTEN2418 )
2419 {2420 return;
2421 }2422 sk->inuse = 1;
2423
2424 /*2425 * flag that the sender has shutdown2426 */2427
2428 sk->shutdown |= SEND_SHUTDOWN;
2429
2430 /*2431 * Clear out any half completed packets. 2432 */2433
2434 if (sk->partial)
2435 tcp_send_partial(sk);
2436
2437 /*2438 * FIN if needed2439 */2440
2441 if(tcp_close_state(sk))
2442 tcp_send_fin(sk);
2443
2444 release_sock(sk);
2445 }2446
2447
2448 staticint2449 tcp_recvfrom(structsock *sk, unsignedchar *to,
/* */2450 intto_len, intnonblock, unsignedflags,
2451 structsockaddr_in *addr, int *addr_len)
2452 {2453 intresult;
2454
2455 /* 2456 * Have to check these first unlike the old code. If 2457 * we check them after we lose data on an error2458 * which is wrong 2459 */2460
2461 if(addr_len)
2462 *addr_len = sizeof(*addr);
2463 result=tcp_read(sk, to, to_len, nonblock, flags);
2464
2465 if (result < 0)
2466 return(result);
2467
2468 if(addr)
2469 {2470 addr->sin_family = AF_INET;
2471 addr->sin_port = sk->dummy_th.dest;
2472 addr->sin_addr.s_addr = sk->daddr;
2473 }2474 return(result);
2475 }2476
2477
2478 /*2479 * This routine will send an RST to the other tcp. 2480 */2481
2482 staticvoidtcp_reset(unsignedlongsaddr, unsignedlongdaddr, structtcphdr *th,
/* */2483 structproto *prot, structoptions *opt, structdevice *dev, inttos, intttl)
2484 {2485 structsk_buff *buff;
2486 structtcphdr *t1;
2487 inttmp;
2488 structdevice *ndev=NULL;
2489
2490 /*2491 * Cannot reset a reset (Think about it).2492 */2493
2494 if(th->rst)
2495 return;
2496
2497 /*2498 * We need to grab some memory, and put together an RST,2499 * and then put it into the queue to be sent.2500 */2501
2502 buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2503 if (buff == NULL)
2504 return;
2505
2506 buff->len = sizeof(*t1);
2507 buff->sk = NULL;
2508 buff->dev = dev;
2509 buff->localroute = 0;
2510
2511 t1 =(structtcphdr *) buff->data;
2512
2513 /*2514 * Put in the IP header and routing stuff. 2515 */2516
2517 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2518 sizeof(structtcphdr),tos,ttl);
2519 if (tmp < 0)
2520 {2521 buff->free = 1;
2522 prot->wfree(NULL, buff->mem_addr, buff->mem_len);
2523 return;
2524 }2525
2526 t1 =(structtcphdr *)((char *)t1 +tmp);
2527 buff->len += tmp;
2528 memcpy(t1, th, sizeof(*t1));
2529
2530 /*2531 * Swap the send and the receive. 2532 */2533
2534 t1->dest = th->source;
2535 t1->source = th->dest;
2536 t1->rst = 1;
2537 t1->window = 0;
2538
2539 if(th->ack)
2540 {2541 t1->ack = 0;
2542 t1->seq = th->ack_seq;
2543 t1->ack_seq = 0;
2544 }2545 else2546 {2547 t1->ack = 1;
2548 if(!th->syn)
2549 t1->ack_seq=htonl(th->seq);
2550 else2551 t1->ack_seq=htonl(th->seq+1);
2552 t1->seq=0;
2553 }2554
2555 t1->syn = 0;
2556 t1->urg = 0;
2557 t1->fin = 0;
2558 t1->psh = 0;
2559 t1->doff = sizeof(*t1)/4;
2560 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2561 prot->queue_xmit(NULL, ndev, buff, 1);
2562 tcp_statistics.TcpOutSegs++;
2563 }2564
2565
2566 /*2567 * Look for tcp options. Parses everything but only knows about MSS.2568 * This routine is always called with the packet containing the SYN.2569 * However it may also be called with the ack to the SYN. So you2570 * can't assume this is always the SYN. It's always called after2571 * we have set up sk->mtu to our own MTU.2572 *2573 * We need at minimum to add PAWS support here. Possibly large windows2574 * as Linux gets deployed on 100Mb/sec networks.2575 */2576
2577 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2578 {2579 unsignedchar *ptr;
2580 intlength=(th->doff*4)-sizeof(structtcphdr);
2581 intmss_seen = 0;
2582
2583 ptr = (unsignedchar *)(th + 1);
2584
2585 while(length>0)
2586 {2587 intopcode=*ptr++;
2588 intopsize=*ptr++;
2589 switch(opcode)
2590 {2591 caseTCPOPT_EOL:
2592 return;
2593 caseTCPOPT_NOP:
2594 length-=2;
2595 continue;
2596
2597 default:
2598 if(opsize<=2) /* Avoid silly options looping forever */2599 return;
2600 switch(opcode)
2601 {2602 caseTCPOPT_MSS:
2603 if(opsize==4 && th->syn)
2604 {2605 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2606 mss_seen = 1;
2607 }2608 break;
2609 /* Add other options here as people feel the urge to implement stuff like large windows */2610 }2611 ptr+=opsize-2;
2612 length-=opsize;
2613 }2614 }2615 if (th->syn)
2616 {2617 if (! mss_seen)
2618 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2619 }2620 #ifdefCONFIG_INET_PCTCP2621 sk->mss = min(sk->max_window >> 1, sk->mtu);
2622 #else2623 sk->mss = min(sk->max_window, sk->mtu);
2624 #endif2625 }2626
/*
 *	Return the classful (A/B/C) network mask for an address given in
 *	network byte order; the mask is returned in network order too.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);

	if (IN_CLASSA(host))
		return htonl(IN_CLASSA_NET);
	return IN_CLASSB(host) ? htonl(IN_CLASSB_NET) : htonl(IN_CLASSC_NET);
}
/*
 *	Default sequence number picking algorithm.
 *	Clock-driven initial send sequence derived from jiffies, in the
 *	spirit of RFC 793's ISN clock.
 *	NOTE(review): such ISNs are trivially predictable - a sequence
 *	guessing/spoofing risk by modern standards.
 */

extern inline long tcp_init_seq(void)
{
	return jiffies * SEQ_TICK - seq_offset;
}
/*
 *	This routine handles a connection request (incoming SYN on a
 *	listening socket). It should make sure we haven't already
 *	responded. Because of the way BSD works, we have to send a
 *	syn/ack now. This also means it will be harder to close a
 *	socket which is listening.
 *
 *	A new struct sock is cloned from the listener, initialised for
 *	this connection, put in SYN_RECV, and a SYN/ACK carrying our MSS
 *	option is transmitted. The original skb is charged to the new
 *	socket and queued on the listener's receive queue for accept().
 */

static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 unsigned long daddr, unsigned long saddr,
		 struct options *opt, struct device *dev, unsigned long seq)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	unsigned char *ptr;
	struct sock *newsk;
	struct tcphdr *th;
	struct device *ndev = NULL;
	int tmp;
	struct rtable *rt;

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection: reset it. */
	if (!sk->dead)
	{
		sk->data_ready(sk, 0);
	}
	else
	{
		if (sk->debug)
			printk("Reset on %p: Connect on dead socket.\n", sk);
		tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	Make sure we can accept more. This will prevent a
	 *	flurry of syns from eating up all our memory.
	 */
	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	We need to build a new sock struct.
	 *	It is sort of bad to have a socket without an inode attached
	 *	to it, but the wake_up's will just wake up the listening socket,
	 *	and if the listening socket is destroyed before this is taken
	 *	off of the queue, this will take care of it.
	 */
	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* Just ignore the syn. It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/* Clone the listener, then re-initialise all per-connection state. */
	memcpy(newsk, sk, sizeof(*newsk));
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;	/* slow start: one segment */
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->fin_seq = skb->h.th->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger = 0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	init_timer(&newsk->retransmit_timer);
	newsk->timer.data = (unsigned long) newsk;
	newsk->timer.function = &net_timer;
	newsk->retransmit_timer.data = (unsigned long) newsk;
	newsk->retransmit_timer.function = &retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/*
	 *	Swap these two, they are from our point of view.
	 */
	newsk->daddr = saddr;
	newsk->saddr = daddr;

	put_sock(newsk->num, newsk);
	newsk->dummy_th.res1 = 0;
	newsk->dummy_th.doff = 6;
	newsk->dummy_th.fin = 0;
	newsk->dummy_th.syn = 0;
	newsk->dummy_th.rst = 0;
	newsk->dummy_th.psh = 0;
	newsk->dummy_th.ack = 0;
	newsk->dummy_th.urg = 0;
	newsk->dummy_th.res2 = 0;
	/* NOTE(review): acked_seq/copied_seq were already set above -
	   these two assignments look redundant but are kept as-is. */
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->socket = NULL;

	/*
	 *	Grab the ttl and tos values and use them
	 */
	newsk->ip_ttl = sk->ip_ttl;
	newsk->ip_tos = skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for.
	 *	Note use of sk->user_mss, since user has no direct
	 *	access to newsk.
	 */
	rt = ip_rt_route(saddr, NULL, NULL);

	if (rt != NULL && (rt->rt_flags & RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if (rt != NULL && (rt->rt_flags & RTF_MSS))
		newsk->mtu = rt->rt_mss - HEADER_SIZE;
	else
	{
#ifdef CONFIG_INET_SNARL	/* Sub Nets Are Local */
		if ((saddr ^ daddr) & default_mask(saddr))
#else
		if ((saddr ^ daddr) & dev->pa_mask)
#endif
			/* Off-net destination: conservative 576-byte datagram. */
			newsk->mtu = 576 - HEADER_SIZE;
		else
			newsk->mtu = MAX_WINDOW;
	}

	/*
	 *	But not bigger than device MTU
	 */
	newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	This will min with what arrived in the packet
	 */
	tcp_options(newsk, skb->h.th);

	buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		sk->err = -ENOMEM;
		newsk->dead = 1;
		release_sock(newsk);
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->len = sizeof(struct tcphdr) + 4;	/* header plus 4-byte MSS option */
	buff->sk = newsk;
	buff->localroute = newsk->localroute;

	t1 = (struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
			       IPPROTO_TCP, NULL, MAX_SYN_SIZE, sk->ip_tos, sk->ip_ttl);

	/*
	 *	Something went wrong.
	 */
	if (tmp < 0)
	{
		sk->err = tmp;
		buff->free = 1;
		kfree_skb(buff, FREE_WRITE);
		newsk->dead = 1;
		release_sock(newsk);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->len += tmp;
	t1 = (struct tcphdr *)((char *) t1 + tmp);

	/* Build the SYN/ACK: copy the incoming header, then swap
	   the send and the receive ends. */
	memcpy(t1, skb->h.th, sizeof(*t1));
	buff->h.seq = newsk->write_seq;
	t1->dest = skb->h.th->source;
	t1->source = newsk->dummy_th.source;
	t1->seq = ntohl(newsk->write_seq++);
	t1->ack = 1;
	newsk->window = tcp_select_window(newsk);
	newsk->sent_seq = newsk->write_seq;
	t1->window = ntohs(newsk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->ack_seq = ntohl(skb->h.th->seq + 1);
	t1->doff = sizeof(*t1)/4 + 1;	/* +1 word for the MSS option */
	/* MSS option: kind 2, length 4, 16-bit MSS value. */
	ptr = (unsigned char *)(t1 + 1);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
	ptr[3] = (newsk->mtu) & 0xff;

	tcp_send_check(t1, daddr, saddr, sizeof(*t1) + 4, newsk);
	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
	/* NOTE(review): the second call immediately overrides the first -
	   looks redundant; kept as in the original. */
	reset_xmit_timer(newsk, TIME_WRITE, newsk->rto);

	reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
	skb->sk = newsk;

	/*
	 *	Charge the sock_buff to newsk.
	 */
	sk->rmem_alloc -= skb->mem_len;
	newsk->rmem_alloc += skb->mem_len;

	/* Park the SYN on the listener's receive queue for accept(). */
	skb_queue_tail(&sk->receive_queue, skb);
	sk->ack_backlog++;
	release_sock(newsk);
	tcp_statistics.TcpOutSegs++;
}
2925
2926 staticvoidtcp_close(structsock *sk, inttimeout)
/* */2927 {2928 /*2929 * We need to grab some memory, and put together a FIN, 2930 * and then put it into the queue to be sent.2931 */2932
2933 sk->inuse = 1;
2934
2935 if(sk->state == TCP_LISTEN)
2936 {2937 /* Special case */2938 tcp_set_state(sk, TCP_CLOSE);
2939 tcp_close_pending(sk, timeout);
2940 release_sock(sk);
2941 return;
2942 }2943
2944 sk->keepopen = 1;
2945 sk->shutdown = SHUTDOWN_MASK;
2946
2947 if (!sk->dead)
2948 sk->state_change(sk);
2949
2950 if (timeout == 0)
2951 {2952 structsk_buff *skb;
2953
2954 /*2955 * We need to flush the recv. buffs. We do this only on the2956 * descriptor close, not protocol-sourced closes, because the2957 * reader process may not have drained the data yet!2958 */2959
2960 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
2961 kfree_skb(skb, FREE_READ);
2962 }2963
2964 /*2965 * Get rid off any half-completed packets. 2966 */2967
2968 if (sk->partial)
2969 tcp_send_partial(sk);
2970
2971 /*2972 * Timeout is not the same thing - however the code likes2973 * to send both the same way (sigh).2974 */2975
2976 if(timeout)
2977 {2978 /*2979 * Time wait to avoid port reusage accidents if 2980 * appropriate. If we have timed out from one2981 * of these states then move straight to close.2982 */2983
2984 if( sk->state == TCP_TIME_WAIT || sk->state == TCP_LAST_ACK2985 || sk->state == TCP_SYN_SENT || sk->state == TCP_CLOSE)
2986 tcp_set_state(sk, TCP_CLOSE); /* Dead */2987 else2988 tcp_time_wait(sk);
2989 }2990 else2991 {2992 if(tcp_close_state(sk)==1)
2993 {2994 tcp_send_fin(sk);
2995 }2996 }2997 release_sock(sk);
2998 }2999
3000
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue. This happens as incoming acks
 *	open up the remote window for us.
 */

static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy.
	 */
	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */
	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */
		if (before(skb->h.seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it. This
			 *	cannot currently occur.
			 */
			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;

			/*
			 *	Put in the ack seq and window at this point rather
			 *	than earlier, in order to keep them monotonic.
			 *	We really want to avoid taking back window allocations.
			 *	That's legal, but RFC1122 says it's frowned on.
			 *	Ack and window will in general have changed since
			 *	this packet was put on the write queue.
			 */
			iph = (struct iphdr *)(skb->data +
					skb->dev->hard_header_len);
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			/* size = TCP header + payload actually on the wire. */
			size = skb->len - (((unsigned char *) th) - skb->data);

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			/* Checksum must be redone: ack/window just changed. */
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */
			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3092
/*
 *	This routine deals with incoming acks, but not outgoing ones.
 *
 *	It updates the send window, runs slow start / congestion
 *	avoidance, trims the retransmit queue, maintains the RTT
 *	estimator (Jacobson SIGCOMM '88 / Karn), drives the transmit
 *	and probe timers, and advances the connection state machine
 *	for acks of our FIN.
 */

extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
	unsigned long ack;
	int flag = 0;

	/*
	 * 1 - there was data in packet as well as ack or new data is sent or
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if(sk->zapped)
		return(1);	/* Dead, cant ack any more so why bother */

	/*
	 *	Have we discovered a larger window?
	 */
	ack = ntohl(th->ack_seq);

	if (ntohs(th->window) > sk->max_window)
	{
		sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(sk->max_window>>1, sk->mtu);
#else
		sk->mss = min(sk->max_window, sk->mtu);
#endif
	}

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */
	if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */
	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
	{
		if(sk->debug)
			printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);

		/*
		 *	An ack for data we never sent: drop it outright.
		 */
		if (after(ack, sk->sent_seq))
		{
			return(0);
		}

		/*
		 *	Old duplicate ack: restart the keepalive timer.
		 */
		if (sk->keepopen)
		{
			if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
		}
		return(1);
	}

	/*
	 *	If there is data set flag 1
	 */
	if (len != th->doff*4)
		flag |= 1;

	/*
	 *	See if our window has been shrunk.
	 */
	if (after(sk->window_seq, ack+ntohs(th->window)))
	{
		/*
		 *	We may need to move packets from the send queue
		 *	to the write queue, if the window has been shrunk on us.
		 *	The RFC says you are not allowed to shrink your window
		 *	like this, but if the other end does, you must be able
		 *	to deal with it.
		 */
		struct sk_buff *skb;
		struct sk_buff *skb2;
		struct sk_buff *wskb = NULL;

		skb2 = sk->send_head;
		sk->send_head = NULL;
		sk->send_tail = NULL;

		/*
		 *	This is an artifact of a flawed concept. We want one
		 *	queue and a smarter send routine when we send all.
		 */
		flag |= 4;	/* Window changed */

		sk->window_seq = ack + ntohs(th->window);
		cli();
		/* Walk the old retransmit list: segments now beyond the
		   window go back on the write queue (in order), the rest
		   are relinked as the new retransmit list. */
		while (skb2 != NULL)
		{
			skb = skb2;
			skb2 = skb->link3;
			skb->link3 = NULL;
			if (after(skb->h.seq, sk->window_seq))
			{
				if (sk->packets_out > 0)
					sk->packets_out--;
				/* We may need to remove this from the dev send list. */
				if (skb->next != NULL)
				{
					skb_unlink(skb);
				}
				/* Now add it to the write_queue. */
				if (wskb == NULL)
					skb_queue_head(&sk->write_queue,skb);
				else
					skb_append(wskb,skb);
				wskb = skb;
			}
			else
			{
				if (sk->send_head == NULL)
				{
					sk->send_head = skb;
					sk->send_tail = skb;
				}
				else
				{
					sk->send_tail->link3 = skb;
					sk->send_tail = skb;
				}
				skb->link3 = NULL;
			}
		}
		sti();
	}

	/*
	 *	Pipe has emptied
	 */
	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out= 0;
	}

	/*
	 *	Update the right hand window edge of the host
	 */
	sk->window_seq = ack + ntohs(th->window);

	/*
	 *	We don't want too many packets out there.
	 */
	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
		 * counter and increment it once every cwnd times. It's possible
		 * that this should be done only if sk->retransmits == 0. I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly. In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

	/*
	 *	Remember the highest ack received.
	 */
	sk->rcv_ack_seq = ack;

	/*
	 *	If this ack opens up a zero window, clear backoff. It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */
	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open?
		 */
		if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
		    ! before (sk->window_seq, sk->write_queue.next->h.seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt. this eliminates any backoff.
			 */
			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < 20)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = 20;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */
	while(sk->send_head != NULL)
	{
		/* Check for a bug. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as its confirmed to have arrived the other end.
		 */
		if (before(sk->send_head->h.seq, ack+1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 *	We were retransmitting. don't count this in RTT est
				 */
				flag |= 2;

				/*
				 *	even though we've gotten an ack, we're still
				 *	retransmitting as long as we're sending from
				 *	the retransmit queue. Keeping retransmits non-zero
				 *	prevents us from getting new data interspersed with
				 *	retransmissions.
				 */
				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 *	Note that we only reset backoff and rto in the
			 *	rtt recomputation code. And that doesn't happen
			 *	if there were retransmissions in effect. So the
			 *	first new packet after the retransmissions is
			 *	sent with the backoff still in effect. Not until
			 *	we get an ack from a non-retransmitted packet do
			 *	we reset the backoff and rto. This allows us to deal
			 *	with a situation where the network delay has increased
			 *	suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 *	We have one less packet out there.
			 */
			if (sk->packets_out > 0)
				sk->packets_out --;
			/*
			 *	Wake up the process, it can probably write more.
			 */
			if (!sk->dead)
				sk->write_space(sk);
			oskb = sk->send_head;

			if (!(flag&2)) 	/* Not retransmitting */
			{
				long m;

				/*
				 *	The following amusing code comes from Jacobson's
				 *	article in SIGCOMM '88. Note that rtt and mdev
				 *	are scaled versions of rtt and mean deviation.
				 *	This is designed to be as fast as possible
				 *	m stands for "measurement".
				 */
				m = jiffies - oskb->when;  /* RTT */
				if(m<=0)
					m=1;		/* IS THIS RIGHT FOR <0 ??? */
				m -= (sk->rtt >> 3);    /* m is now error in rtt est */
				sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
				if (m < 0)
					m = -m;		/* m is now abs(error) */
				m -= (sk->mdev >> 2);   /* similar update on mdev */
				sk->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */

				/*
				 *	Now update timeout. Note that this removes any backoff.
				 */
				sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
				if (sk->rto > 120*HZ)
					sk->rto = 120*HZ;
				if (sk->rto < 20)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
					sk->rto = 20;
				sk->backoff = 0;
			}
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
					   In this case as we just set it up */
			cli();
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 *	We may need to remove this from the dev send list.
			 */
			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 *	XXX someone ought to look at this too.. at the moment, if skb_peek()
	 *	returns non-NULL, we complete ignore the timer stuff in the else
	 *	clause. We ought to organize the code so that else clause can
	 *	(should) be executed regardless, possibly moving the PROBE timer
	 *	reset over. The skb_peek() thing should only move stuff to the
	 *	write queue, NOT also manage the timer functions.
	 */

	/*
	 *	Maybe we can take some stuff off of the write queue,
	 *	and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room: start zero-window probing.
			 */
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 *	from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 *	from TCP_CLOSE we don't do anything
		 *
		 *	from anything else, if there is write data (or fin) pending,
		 *	we use a TIME_WRITE timeout, else if keepalive we reset to
		 *	a KEEPALIVE timeout, else we delete the timer.
		 *
		 *	We do not set flag for nominal write data, otherwise we may
		 *	force a state where we start to write itsy bitsy tidbits
		 *	of data.
		 */
		switch(sk->state) {
		case TCP_TIME_WAIT:
			/*
			 *	keep us in TIME_WAIT until we stop getting packets,
			 *	reset the timeout.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 *	don't touch the timer.
			 */
			break;
		default:
			/*
			 *	Must check send_head, write_queue, and ack_backlog
			 *	to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 *	We have nothing queued but space to send. Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */
	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 *	In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
	 *	we are now waiting for an acknowledge to our FIN. The other end is
	 *	already in TIME_WAIT.
	 *
	 *	Move to TCP_CLOSE on success.
	 */
	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if(sk->debug)
			printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq && sk->acked_seq == sk->fin_seq)
		{
			flag |= 1;
			tcp_set_state(sk,TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 *	Move to FIN_WAIT2 to await a FIN from the other end. Set
	 *	SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */
	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 *	Move to TIME_WAIT
	 */
	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake
	 */
	if(sk->state==TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest=th->source;
		sk->copied_seq = sk->acked_seq;
		if(!sk->dead)
			sk->state_change(sk);
		if(sk->max_window==0)
		{
			sk->max_window=32;	/* Sanity check */
			sk->mss=min(sk->max_window,sk->mtu);
		}
	}

	/*
	 *	I make no guarantees about the first clause in the following
	 *	test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
	 *	what conditions "!flag" would be true. However I think the rest
	 *	of the conditions would prevent that from causing any
	 *	unnecessary retransmission.
	 *	Clearly if the first packet has expired it should be
	 *	retransmitted. The other alternative, "flag&2 && retransmits", is
	 *	harder to explain: You have to look carefully at how and when the
	 *	timer is set and with what timeout. The most recent transmission always
	 *	sets the timer. So in general if the most recent thing has timed
	 *	out, everything before it has as well. So we want to go ahead and
	 *	retransmit some more. If we didn't explicitly test for this
	 *	condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 *	would not be true. If you look at the pattern of timing, you can
	 *	show that rto is increased fast enough that the next packet would
	 *	almost never be retransmitted immediately. Then you'd end up
	 *	waiting for a timeout to send each packet on the retransmission
	 *	queue. With my implementation of the Karn sampling algorithm,
	 *	the timeout would double each time. The net result is that it would
	 *	take a hideous amount of time to recover from a single dropped packet.
	 *	It's possible that there should also be a test for TIME_WRITE, but
	 *	I think as long as "send_head != NULL" and "retransmit" is on, we've
	 *	got to be in real retransmission mode.
	 *	Note that tcp_do_retransmit is called with all==1. Setting cong_window
	 *	back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 *	As long as no further losses occur, this seems reasonable.
	 */
	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	       (((flag&2) && sk->retransmits) ||
	       (sk->send_head->when + sk->rto < jiffies)))
	{
		if(sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return(1);
}
3683
/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 */
static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	/* First unseen octet after this segment (SYN and FIN each occupy
	   one sequence number). */
	sk->fin_seq = th->seq + skb->len + th->syn + th->fin;

	if (!sk->dead)
	{
		/* Wake sleepers and post SIGIO so select()/read() notice EOF. */
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * Move to CLOSE_WAIT; tcp_data() already handled
			 * sending the ack.
			 * NOTE(review): arming the MSL timer while entering
			 * CLOSE_WAIT looks suspect (the original comment said
			 * "Check me") -- confirm the intended timeout here.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * Received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * Received a retransmission of the FIN,
			 * restart the TIME_WAIT timer (full 2MSL again).
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens, we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already
			 * correct for handling this timeout.
			 */
			if(sk->ip_xmit_timeout != TIME_WRITE)
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * Received a FIN -- send ACK and enter TIME_WAIT.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * Already in CLOSE.
			 */
			break;
		default:
			/* Any remaining state collapses to LAST_ACK. */
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}
3780
3781
3782 /*3783 * This routine handles the data. If there is room in the buffer,3784 * it will be have already been moved into it. If there is no3785 * room, then we will just have to discard the packet.3786 */3787
3788 extern__inline__inttcp_data(structsk_buff *skb, structsock *sk,
/* */3789 unsignedlongsaddr, unsignedshortlen)
3790 {3791 structsk_buff *skb1, *skb2;
3792 structtcphdr *th;
3793 intdup_dumped=0;
3794 unsignedlongnew_seq;
3795 unsignedlongshut_seq;
3796
3797 th = skb->h.th;
3798 skb->len = len -(th->doff*4);
3799
3800 /*3801 * The bytes in the receive read/assembly queue has increased. Needed for the3802 * low memory discard algorithm 3803 */3804
3805 sk->bytes_rcv += skb->len;
3806
3807 if (skb->len == 0 && !th->fin && !th->urg && !th->psh)
3808 {3809 /* 3810 * Don't want to keep passing ack's back and forth. 3811 * (someone sent us dataless, boring frame)3812 */3813 if (!th->ack)
3814 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3815 kfree_skb(skb, FREE_READ);
3816 return(0);
3817 }3818
3819 /*3820 * We no longer have anyone receiving data on this connection.3821 */3822
3823 #ifndef TCP_DONT_RST_SHUTDOWN
3824
3825 if(sk->shutdown & RCV_SHUTDOWN)
3826 {3827 /*3828 * FIXME: BSD has some magic to avoid sending resets to3829 * broken 4.2 BSD keepalives. Much to my surprise a few non3830 * BSD stacks still have broken keepalives so we want to3831 * cope with it.3832 */3833
3834 if(skb->len) /* We don't care if its just an ack or3835 a keepalive/window probe */3836 {3837 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */3838
3839 /* Do this the way 4.4BSD treats it. Not what I'd3840 regard as the meaning of the spec but its what BSD3841 does and clearly they know everything 8) */3842
3843 /*3844 * This is valid because of two things3845 *3846 * a) The way tcp_data behaves at the bottom.3847 * b) A fin takes effect when read not when received.3848 */3849
3850 shut_seq=sk->acked_seq+1; /* Last byte */3851
3852 if(after(new_seq,shut_seq))
3853 {3854 if(sk->debug)
3855 printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n",
3856 sk, new_seq, shut_seq, sk->blog);
3857 if(sk->dead)
3858 {3859 sk->acked_seq = new_seq + th->fin;
3860 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3861 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3862 tcp_statistics.TcpEstabResets++;
3863 tcp_set_state(sk,TCP_CLOSE);
3864 sk->err = EPIPE;
3865 sk->shutdown = SHUTDOWN_MASK;
3866 kfree_skb(skb, FREE_READ);
3867 return 0;
3868 }3869 }3870 }3871 }3872
3873 #endif3874
3875 /*3876 * Now we have to walk the chain, and figure out where this one3877 * goes into it. This is set up so that the last packet we received3878 * will be the first one we look at, that way if everything comes3879 * in order, there will be no performance loss, and if they come3880 * out of order we will be able to fit things in nicely.3881 *3882 * [AC: This is wrong. We should assume in order first and then walk3883 * forwards from the first hole based upon real traffic patterns.]3884 * 3885 */3886
3887 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */3888 {3889 skb_queue_head(&sk->receive_queue,skb);
3890 skb1= NULL;
3891 }3892 else3893 {3894 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
3895 {3896 if(sk->debug)
3897 {3898 printk("skb1=%p :", skb1);
3899 printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
3900 printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
3901 printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
3902 sk->acked_seq);
3903 }3904
3905 /*3906 * Optimisation: Duplicate frame or extension of previous frame from3907 * same sequence point (lost ack case).3908 * The frame contains duplicate data or replaces a previous frame3909 * discard the previous frame (safe as sk->inuse is set) and put3910 * the new one in its place.3911 */3912
3913 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3914 {3915 skb_append(skb1,skb);
3916 skb_unlink(skb1);
3917 kfree_skb(skb1,FREE_READ);
3918 dup_dumped=1;
3919 skb1=NULL;
3920 break;
3921 }3922
3923 /*3924 * Found where it fits3925 */3926
3927 if (after(th->seq+1, skb1->h.th->seq))
3928 {3929 skb_append(skb1,skb);
3930 break;
3931 }3932
3933 /*3934 * See if we've hit the start. If so insert.3935 */3936 if (skb1 == skb_peek(&sk->receive_queue))
3937 {3938 skb_queue_head(&sk->receive_queue, skb);
3939 break;
3940 }3941 }3942 }3943
3944 /*3945 * Figure out what the ack value for this frame is3946 */3947
3948 th->ack_seq = th->seq + skb->len;
3949 if (th->syn)
3950 th->ack_seq++;
3951 if (th->fin)
3952 th->ack_seq++;
3953
3954 if (before(sk->acked_seq, sk->copied_seq))
3955 {3956 printk("*** tcp.c:tcp_data bug acked < copied\n");
3957 sk->acked_seq = sk->copied_seq;
3958 }3959
3960 /*3961 * Now figure out if we can ack anything. This is very messy because we really want two3962 * receive queues, a completed and an assembly queue. We also want only one transmit3963 * queue.3964 */3965
3966 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
3967 {3968 if (before(th->seq, sk->acked_seq+1))
3969 {3970 intnewwindow;
3971
3972 if (after(th->ack_seq, sk->acked_seq))
3973 {3974 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3975 if (newwindow < 0)
3976 newwindow = 0;
3977 sk->window = newwindow;
3978 sk->acked_seq = th->ack_seq;
3979 }3980 skb->acked = 1;
3981
3982 /*3983 * When we ack the fin, we do the FIN 3984 * processing.3985 */3986
3987 if (skb->h.th->fin)
3988 {3989 tcp_fin(skb,sk,skb->h.th);
3990 }3991
3992 for(skb2 = skb->next;
3993 skb2 != (structsk_buff *)&sk->receive_queue;
3994 skb2 = skb2->next)
3995 {3996 if (before(skb2->h.th->seq, sk->acked_seq+1))
3997 {3998 if (after(skb2->h.th->ack_seq, sk->acked_seq))
3999 {4000 newwindow = sk->window -
4001 (skb2->h.th->ack_seq - sk->acked_seq);
4002 if (newwindow < 0)
4003 newwindow = 0;
4004 sk->window = newwindow;
4005 sk->acked_seq = skb2->h.th->ack_seq;
4006 }4007 skb2->acked = 1;
4008 /*4009 * When we ack the fin, we do4010 * the fin handling.4011 */4012 if (skb2->h.th->fin)
4013 {4014 tcp_fin(skb,sk,skb->h.th);
4015 }4016
4017 /*4018 * Force an immediate ack.4019 */4020
4021 sk->ack_backlog = sk->max_ack_backlog;
4022 }4023 else4024 {4025 break;
4026 }4027 }4028
4029 /*4030 * This also takes care of updating the window.4031 * This if statement needs to be simplified.4032 */4033 if (!sk->delay_acks ||
4034 sk->ack_backlog >= sk->max_ack_backlog ||
4035 sk->bytes_rcv > sk->max_unacked || th->fin) {4036 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4037 }4038 else4039 {4040 sk->ack_backlog++;
4041 if(sk->debug)
4042 printk("Ack queued.\n");
4043 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4044 }4045 }4046 }4047
4048 /*4049 * If we've missed a packet, send an ack.4050 * Also start a timer to send another.4051 */4052
4053 if (!skb->acked)
4054 {4055
4056 /*4057 * This is important. If we don't have much room left,4058 * we need to throw out a few packets so we have a good4059 * window. Note that mtu is used, not mss, because mss is really4060 * for the send side. He could be sending us stuff as large as mtu.4061 */4062
4063 while (sk->prot->rspace(sk) < sk->mtu)
4064 {4065 skb1 = skb_peek(&sk->receive_queue);
4066 if (skb1 == NULL)
4067 {4068 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4069 break;
4070 }4071
4072 /*4073 * Don't throw out something that has been acked. 4074 */4075
4076 if (skb1->acked)
4077 {4078 break;
4079 }4080
4081 skb_unlink(skb1);
4082 kfree_skb(skb1, FREE_READ);
4083 }4084 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4085 sk->ack_backlog++;
4086 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4087 }4088 else4089 {4090 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4091 }4092
4093 /*4094 * Now tell the user we may have some data. 4095 */4096
4097 if (!sk->dead)
4098 {4099 if(sk->debug)
4100 printk("Data wakeup.\n");
4101 sk->data_ready(sk,0);
4102 }4103 return(0);
4104 }4105
4106
4107 /*4108 * This routine is only called when we have urgent data4109 * signalled. Its the 'slow' part of tcp_urg. It could be4110 * moved inline now as tcp_urg is only called from one4111 * place. We handle URGent data wrong. We have to - as4112 * BSD still doesn't use the correction from RFC961.4113 */4114
4115 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4116 {4117 unsignedlongptr = ntohs(th->urg_ptr);
4118
4119 if (ptr)
4120 ptr--;
4121 ptr += th->seq;
4122
4123 /* ignore urgent data that we've already seen and read */4124 if (after(sk->copied_seq, ptr))
4125 return;
4126
4127 /* do we already have a newer (or duplicate) urgent pointer? */4128 if (sk->urg_data && !after(ptr, sk->urg_seq))
4129 return;
4130
4131 /* tell the world about our new urgent pointer */4132 if (sk->proc != 0) {4133 if (sk->proc > 0) {4134 kill_proc(sk->proc, SIGURG, 1);
4135 }else{4136 kill_pg(-sk->proc, SIGURG, 1);
4137 }4138 }4139 sk->urg_data = URG_NOTYET;
4140 sk->urg_seq = ptr;
4141 }4142
4143 /*4144 * This is the 'fast' part of urgent handling.4145 */4146
4147 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4148 unsignedlongsaddr, unsignedlonglen)
4149 {4150 unsignedlongptr;
4151
4152 /*4153 * Check if we get a new urgent pointer - normally not 4154 */4155
4156 if (th->urg)
4157 tcp_check_urg(sk,th);
4158
4159 /*4160 * Do we wait for any urgent data? - normally not4161 */4162
4163 if (sk->urg_data != URG_NOTYET)
4164 return 0;
4165
4166 /*4167 * Is the urgent pointer pointing into this packet? 4168 */4169
4170 ptr = sk->urg_seq - th->seq + th->doff*4;
4171 if (ptr >= len)
4172 return 0;
4173
4174 /*4175 * Ok, got the correct packet, update info 4176 */4177
4178 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4179 if (!sk->dead)
4180 sk->data_ready(sk,0);
4181 return 0;
4182 }4183
/*
 *	This will accept the next outstanding connection.
 *	Returns the newly established sock, or NULL with sk->err set to
 *	EINVAL (not listening), EAGAIN (non-blocking, nothing queued) or
 *	ERESTARTSYS (interrupted by a signal).
 */
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 * We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */

	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race: block interrupts while claiming the socket. */
	cli();
	sk->inuse = 1;

	/* Wait until a completed (established) connection is queued. */
	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			/* Non-blocking accept: report "try again later". */
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Drop the lock (flushes the backlog) and sleep for a wakeup. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked)
		{
			/* Interrupted; the syscall layer may restart us. */
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;	/* re-claim before re-checking the queue */
	}
	sti();

	/*
	 * Now all we need to do is return skb->sk.
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4242
/*
 *	This will initiate an outgoing connection: validate the address,
 *	choose the initial sequence number, build and send the SYN (with
 *	an MSS option) and move the socket to SYN_SENT.
 *	Returns 0 or a negative errno (-EISCONN, -EINVAL, -EAFNOSUPPORT,
 *	-ENETUNREACH, -ENOMEM).
 */
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
	{
		return(-EISCONN);
	}

	/* Need at least family + port + address. */
	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 * connect() to INADDR_ANY means loopback (BSD'ism).
	 */

	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 * Don't want a TCP connection going to a broadcast address.
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	/* Set the destination and pick the initial send sequence. */
	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = jiffies * SEQ_TICK - seq_offset;	/* clock-driven ISS */
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	release_sock(sk);

	/* May sleep (GFP_KERNEL) -- hence the release above. */
	buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->len = 24;			/* TCP header (20) + MSS option (4) */
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;

	t1 = (struct tcphdr *) buff->data;

	/*
	 * Put in the IP header and routing stuff.
	 */

	rt=ip_rt_route(sk->daddr, NULL, NULL);


	/*
	 * We need to build the routing stuff from the things saved in skb.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 +tmp);

	/* Fill in the SYN from the socket's template header. */
	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	t1->seq = ntohl(sk->write_seq++);
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;		/* tiny initial window; opened after handshake */
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;		/* 24 bytes: header plus the MSS option */
	/* use 512 or whatever user asked for */

	/* Clamp the window if the route demands it. */
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/* MSS choice: user setting, then route, then subnet heuristic. */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
		sk->mtu = rt->rt_mss;
	else
	{
#ifdef CONFIG_INET_SNARL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - HEADER_SIZE;	/* off-net: be conservative */
		else
			sk->mtu = MAX_WINDOW;
	}
	/*
	 * but not bigger than device MTU
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 * Put in the TCP options to say MTU.
	 */

	ptr = (unsigned char *)(t1+1);
	ptr[0] = 2;			/* option kind: MSS */
	ptr[1] = 4;			/* option length */
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 * This must go first otherwise a really quick response will get reset.
	 */

	tcp_set_state(sk,TCP_SYN_SENT);
	sk->rto = TCP_TIMEOUT_INIT;
	init_timer(&sk->retransmit_timer);
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = TCP_SYN_RETRIES;

	sk->prot->queue_xmit(sk, dev, buff, 0);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4405
/*
 * This functions checks to see if the tcp header is actually acceptable:
 * the RFC793 acceptability test, approximated against our receive window.
 * Returns 1 if at least part of the segment is interesting, 0 if it must
 * be discarded.  Unacceptable segments are answered with an ack (or a
 * reset while still unsynchronised) to resynchronise the peer.
 */
extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
	struct options *opt, unsigned long saddr, struct device *dev)
{
	unsigned long next_seq;

	/* Payload length; the FIN occupies one sequence number. */
	next_seq = len - 4*th->doff;
	if (th->fin)
		next_seq++;
	/* If we have a zero window, we can't have any data in the packet. */
	if (next_seq && !sk->window)
		goto ignore_it;
	next_seq += th->seq;	/* first sequence number past the segment */

	/*
	 * This isn't quite right. sk->acked_seq could be more recent
	 * than sk->window. This is however close enough. We will accept
	 * slightly more packets than we should, but it should not cause
	 * problems unless someone is trying to forge packets.
	 */

	/* Have we already seen all of this packet? */
	if (!after(next_seq+1, sk->acked_seq))
		goto ignore_it;
	/* Or does it start beyond the window? */
	if (!before(th->seq, sk->acked_seq + sk->window + 1))
		goto ignore_it;

	/* Ok, at least part of this packet would seem interesting. */
	return 1;

ignore_it:
	/* Never answer a RST (avoids ack/reset wars). */
	if (th->rst)
		return 0;

	/*
	 * Send a reset if we get something not ours and we are
	 * unsynchronized. Note: We don't do anything to our end. We
	 * are just killing the bogus remote connection then we will
	 * connect again and it will work (with luck).
	 */

	if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
	{
		tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
		return 1;
	}

	/* Try to resync things: tell the peer where we really are. */
	tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
	return 0;
}
/*
 *	Standard handling for a received RST: mark the socket zapped,
 *	choose the errno the user will see, close the connection (unless
 *	RFC1337 TIME-WAIT assassination protection forbids it), wake the
 *	owner, and consume the frame.  Always returns 0.
 */
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Pick the error the state implies. */
	switch (sk->state) {
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 * Time wait assassination protection [RFC1337]:
	 * never let a RST drop us out of TIME_WAIT early.
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk, TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk, TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return 0;
}
/*
 *	A TCP packet has arrived.
 *	Main receive entry point: checksum, socket lookup, backlog
 *	queueing, then the RFC793/RFC1122 segment processing steps
 *	(LISTEN / SYN_SENT handling, sequence check, RST, SYN-in-window,
 *	ACK, urgent data, data).  Always returns 0; the skb is consumed
 *	on every path.
 */
int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	unsigned long daddr, unsigned short len,
	unsigned long saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	if (!skb)
	{
		printk("IMPOSSIBLE 1\n");
		return(0);
	}

	if (!dev)
	{
		printk("IMPOSSIBLE 2\n");
		return(0);
	}

	tcp_statistics.TcpInSegs++;

	/* Discard frames not addressed to this host. */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 * Find the socket.
	 */

	sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);

	/*
	 * If this socket has got a reset its to all intents and purposes
	 * really dead. Count closed sockets as dead.
	 *
	 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 * simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 * exist so should cause resets as if the port was unreachable.
	 */

	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo)		/* first pass (not replayed from the backlog) */
	{
		if (tcp_check(th, len, saddr, daddr ))
		{
			/* Bad checksum: silently discard. */
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 * We don't release the socket because it was
			 * never marked in use.
			 */
			return(0);
		}
		th->seq = ntohl(th->seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 * No such TCB. If th->rst is 0 send a reset
			 * (checked in tcp_reset).
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			/*
			 * Discard frame.
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->len = len;
		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here. */
		cli();
		if (sk->inuse)
		{
			/* Socket busy: queue for later replay (redo=1). */
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Backlog replay: the socket may have died meanwhile. */
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 * Charge the memory to the socket; drop the frame if the
	 * receive buffer allowance is exhausted.
	 */

	if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

	skb->sk=sk;
	sk->rmem_alloc += skb->mem_len;

	/*
	 * This basically follows the flow suggested by RFC793, with the
	 * corrections in RFC1122. We don't implement precedence and we
	 * process URG incorrectly (deliberately so) for BSD bug
	 * compatibility. We also set up variables more thoroughly [Karn
	 * notes in the KA9Q code the RFC793 incoming segment rules don't
	 * initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{

		/*
		 * Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			/* An ACK to a LISTEN socket is answered with a reset.
			   (These use the socket TOS; might want the received TOS.) */
			if(th->ack)
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 * We don't care for RST, and non SYN are absorbed (old
			 * segments). Broadcast/multicast SYN isn't allowed.
			 * Note - bug if you change the netmask on a running
			 * connection it can go broadcast. Even Sun's have
			 * this problem so I'm ignoring it.
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 * Guess we need to make a new socket up.
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 * Now we have several options: In theory there is nothing
			 * else in the frame. KA9Q has an option to send data with
			 * the syn, BSD accepts data with the syn up to the [to be]
			 * advertised window and Solaris 2.1 gives you a protocol
			 * error. For now we just ignore it, that fits the spec
			 * precisely and avoids incompatibilities. It would be nice
			 * in future to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* Retransmitted SYN for an embryonic connection? Just drop it. */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 * SYN sent means we have to look for a suitable ack and either
		 * reset for bad matches or go to connected.
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but its not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/*
					 * Reset the ack - its an ack from a
					 * different connection [th->rst is
					 * checked in tcp_reset()]
					 */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/*
					 * A valid ack from a different connection
					 * start. Shouldn't happen but cover it.
					 */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}
				/*
				 * Ok.. its good. Set up sequence numbers and
				 * move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq=th->seq+1;
				sk->fin_seq=th->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);	/* pick up the peer's MSS */
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if(sk->max_window==0)
				{
					sk->max_window = 32;	/* Sanity check */
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/*
					 * Crossed SYN's are fine - but talking to
					 * yourself is right out...
					 */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 * FIXME:
					 * Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 * SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

		/*
		 * BSD has a funny hack with TIME_WAIT and fast reuse of a port.
		 * There is a more complex suggestion for fixing these reuse
		 * issues in RFC1644 but not yet ready for general use. Also
		 * see RFC1379.
		 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		/* A fresh SYN into our TIME_WAIT: kill the old incarnation and,
		   if a listener exists, hand the SYN to it with a higher ISS. */
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(th->seq, sk->acked_seq) && !th->rst)
		{
			long seq=sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			sk->rmem_alloc -= skb->mem_len;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->mem_len;
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 * We are now in normal data flow (see the step list in the RFC).
	 * Note most of these are inline now. I'll inline the lot when
	 * I have time to test it hard and look at what gcc outputs.
	 */

	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		/* Out-of-window segment: tcp_sequence already resynced. */
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 * !syn_ok is effectively the state test in RFC793.
	 */

	if(th->syn && !syn_ok)
	{
		/* In-window SYN in a synchronised state: reset both ends. */
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}

	/*
	 * Process the ACK.
	 */


	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 * Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 * Process urgent data.
	 */

	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}


	/*
	 * Process the encapsulated data.
	 */

	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 * And done.
	 */

	release_sock(sk);
	return 0;
}
/*
 *	This routine sends a packet with an out of date sequence
 *	number. It assumes the other end will try to ack it.
 *	(Used as a zero-window probe / to solicit a window update.)
 */
static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev=NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 * Write data can still be transmitted/retransmitted in the
	 * following states. If any other state is encountered, return.
	 * [listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED &&
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 &&
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	)
	{
		return;
	}

	/* Atomic allocation: we may be called from timer context. */
	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
		return;

	buff->len = sizeof(struct tcphdr);
	buff->free = 1;		/* frame is freed after transmit, not queued */
	buff->sk = sk;
	buff->localroute = sk->localroute;

	t1 = (struct tcphdr *) buff->data;

	/* Put in the IP header and routing stuff. */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}

	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 +tmp);

	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

	/*
	 * Use a previous sequence.
	 * This should cause the other end to send an ack.
	 */

	t1->seq = htonl(sk->sent_seq-1);
	t1->ack = 1;
	t1->res1= 0;
	t1->res2= 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
	t1->syn = 0;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(tcp_select_window(sk));
	t1->doff = sizeof(*t1)/4;	/* no options */
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	/*
	 * Send it and free it.
	 * This will prevent the timer from automatically being restarted.
	 */
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
4968 /*4969 * A window probe timeout has occurred.4970 */4971
4972 voidtcp_send_probe0(structsock *sk)
/* */4973 {4974 if (sk->zapped)
4975 return; /* After a valid reset we can send no more */4976
4977 tcp_write_wakeup(sk);
4978
4979 sk->backoff++;
4980 sk->rto = min(sk->rto << 1, 120*HZ);
4981 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
4982 sk->retransmits++;
4983 sk->prot->retransmits ++;
4984 }4985
4986 /*4987 * Socket option code for TCP. 4988 */4989
4990 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */4991 {4992 intval,err;
4993
4994 if(level!=SOL_TCP)
4995 returnip_setsockopt(sk,level,optname,optval,optlen);
4996
4997 if (optval == NULL)
4998 return(-EINVAL);
4999
5000 err=verify_area(VERIFY_READ, optval, sizeof(int));
5001 if(err)
5002 returnerr;
5003
5004 val = get_fs_long((unsignedlong *)optval);
5005
5006 switch(optname)
5007 {5008 caseTCP_MAXSEG:
5009 /*5010 * values greater than interface MTU won't take effect. however at5011 * the point when this call is done we typically don't yet know5012 * which interface is going to be used5013 */5014 if(val<1||val>MAX_WINDOW)
5015 return -EINVAL;
5016 sk->user_mss=val;
5017 return 0;
5018 caseTCP_NODELAY:
5019 sk->nonagle=(val==0)?0:1;
5020 return 0;
5021 default:
5022 return(-ENOPROTOOPT);
5023 }5024 }5025
5026 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5027 {5028 intval,err;
5029
5030 if(level!=SOL_TCP)
5031 returnip_getsockopt(sk,level,optname,optval,optlen);
5032
5033 switch(optname)
5034 {5035 caseTCP_MAXSEG:
5036 val=sk->user_mss;
5037 break;
5038 caseTCP_NODELAY:
5039 val=sk->nonagle;
5040 break;
5041 default:
5042 return(-ENOPROTOOPT);
5043 }5044 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5045 if(err)
5046 returnerr;
5047 put_fs_long(sizeof(int),(unsignedlong *) optlen);
5048
5049 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5050 if(err)
5051 returnerr;
5052 put_fs_long(val,(unsignedlong *)optval);
5053
5054 return(0);
5055 }5056
5057
5058 structprototcp_prot = {5059 sock_wmalloc,
5060 sock_rmalloc,
5061 sock_wfree,
5062 sock_rfree,
5063 sock_rspace,
5064 sock_wspace,
5065 tcp_close,
5066 tcp_read,
5067 tcp_write,
5068 tcp_sendto,
5069 tcp_recvfrom,
5070 ip_build_header,
5071 tcp_connect,
5072 tcp_accept,
5073 ip_queue_xmit,
5074 tcp_retransmit,
5075 tcp_write_wakeup,
5076 tcp_read_wakeup,
5077 tcp_rcv,
5078 tcp_select,
5079 tcp_ioctl,
5080 NULL,
5081 tcp_shutdown,
5082 tcp_setsockopt,
5083 tcp_getsockopt,
5084 128,
5085 0,
5086 {NULL,},
5087 "TCP",
5088 0, 0
5089 };