1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@no.unit.nvg> 20 * 21 * Fixes: 22 * Alan Cox : Numerous verify_area() calls 23 * Alan Cox : Set the ACK bit on a reset 24 * Alan Cox : Stopped it crashing if it closed while sk->inuse=1 25 * and was trying to connect (tcp_err()). 26 * Alan Cox : All icmp error handling was broken 27 * pointers passed where wrong and the 28 * socket was looked up backwards. Nobody 29 * tested any icmp error code obviously. 30 * Alan Cox : tcp_err() now handled properly. It wakes people 31 * on errors. select behaves and the icmp error race 32 * has gone by moving it into sock.c 33 * Alan Cox : tcp_reset() fixed to work for everything not just 34 * packets for unknown sockets. 35 * Alan Cox : tcp option processing. 36 * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong] 37 * Herp Rosmanith : More reset fixes 38 * Alan Cox : No longer acks invalid rst frames. Acking 39 * any kind of RST is right out. 40 * Alan Cox : Sets an ignore me flag on an rst receive 41 * otherwise odd bits of prattle escape still 42 * Alan Cox : Fixed another acking RST frame bug. Should stop 43 * LAN workplace lockups. 
44 * Alan Cox : Some tidyups using the new skb list facilities 45 * Alan Cox : sk->keepopen now seems to work 46 * Alan Cox : Pulls options out correctly on accepts 47 * Alan Cox : Fixed assorted sk->rqueue->next errors 48 * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops. 49 * Alan Cox : Tidied tcp_data to avoid a potential nasty. 50 * Alan Cox : Added some better commenting, as the tcp is hard to follow 51 * Alan Cox : Removed incorrect check for 20 * psh 52 * Michael O'Reilly : ack < copied bug fix. 53 * Johannes Stille : Misc tcp fixes (not all in yet). 54 * Alan Cox : FIN with no memory -> CRASH 55 * Alan Cox : Added socket option proto entries. Also added awareness of them to accept. 56 * Alan Cox : Added TCP options (SOL_TCP) 57 * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets. 58 * Alan Cox : Use ip_tos/ip_ttl settings. 59 * Alan Cox : Handle FIN (more) properly (we hope). 60 * Alan Cox : RST frames sent on unsynchronised state ack error/ 61 * Alan Cox : Put in missing check for SYN bit. 62 * Alan Cox : Added tcp_select_window() aka NET2E 63 * window non shrink trick. 64 * Alan Cox : Added a couple of small NET2E timer fixes 65 * Charles Hedrick : TCP fixes 66 * Toomas Tamm : TCP window fixes 67 * Alan Cox : Small URG fix to rlogin ^C ack fight 68 * Charles Hedrick : Rewrote most of it to actually work 69 * Linus : Rewrote tcp_read() and URG handling 70 * completely 71 * Gerhard Koerting: Fixed some missing timer handling 72 * Matthew Dillon : Reworked TCP machine states as per RFC 73 * Gerhard Koerting: PC/TCP workarounds 74 * Adam Caldwell : Assorted timer/timing errors 75 * Matthew Dillon : Fixed another RST bug 76 * Alan Cox : Move to kernel side addressing changes. 77 * Alan Cox : Beginning work on TCP fastpathing (not yet usable) 78 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
79 * Alan Cox : TCP fast path debugging 80 * Alan Cox : Window clamping 81 * Michael Riepe : Bug in tcp_check() 82 * Matt Dillon : More TCP improvements and RST bug fixes 83 * Matt Dillon : Yet more small nasties remove from the TCP code 84 * (Be very nice to this man if tcp finally works 100%) 8) 85 * Alan Cox : BSD accept semantics. 86 * Alan Cox : Reset on closedown bug. 87 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 88 * Michael Pall : Handle select() after URG properly in all cases. 89 * Michael Pall : Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin). 90 * Michael Pall : Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now. 91 * Michael Pall : recv(...,MSG_OOB) never blocks in the BSD api. 92 * Alan Cox : Changed the semantics of sk->socket to 93 * fix a race and a signal problem with 94 * accept() and async I/O. 95 * Alan Cox : Relaxed the rules on tcp_sendto(). 96 * Yury Shevchuk : Really fixed accept() blocking problem. 97 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 98 * clients/servers which listen in on 99 * fixed ports. 100 * Alan Cox : Cleaned the above up and shrank it to 101 * a sensible code size. 102 * Alan Cox : Self connect lockup fix. 103 * Alan Cox : No connect to multicast. 104 * Ross Biro : Close unaccepted children on master 105 * socket close. 106 * Alan Cox : Reset tracing code. 107 * Alan Cox : Spurious resets on shutdown. 108 * Alan Cox : Giant 15 minute/60 second timer error 109 * Alan Cox : Small whoops in selecting before an accept. 110 * Alan Cox : Kept the state trace facility since its 111 * handy for debugging. 112 * Alan Cox : More reset handler fixes. 113 * Alan Cox : Started rewriting the code based on the RFC's 114 * for other useful protocol references see: 115 * Comer, KA9Q NOS, and for a reference on the 116 * difference between specifications and how BSD 117 * works see the 4.4lite source. 
118 * A.N.Kuznetsov : Don't time wait on completion of tidy 119 * close. 120 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 121 * Linus Torvalds : Fixed BSD port reuse to work first syn 122 * Alan Cox : Reimplemented timers as per the RFC and using multiple 123 * timers for sanity. 124 * Alan Cox : Small bug fixes, and a lot of new 125 * comments. 126 * Alan Cox : Fixed dual reader crash by locking 127 * the buffers (much like datagram.c) 128 * Alan Cox : Fixed stuck sockets in probe. A probe 129 * now gets fed up of retrying without 130 * (even a no space) answer. 131 * Alan Cox : Extracted closing code better 132 * Alan Cox : Fixed the closing state machine to 133 * resemble the RFC. 134 * Alan Cox : More 'per spec' fixes. 135 * 136 * 137 * To Fix: 138 * Fast path the code. Two things here - fix the window calculation 139 * so it doesn't iterate over the queue, also spot packets with no funny 140 * options arriving in order and process directly. 141 * 142 * Implement RFC 1191 [Path MTU discovery] 143 * Look at the effect of implementing RFC 1337 suggestions and their impact. 144 * Rewrite output state machine to use a single queue and do low window 145 * situations as per the spec (RFC 1122) 146 * Speed up input assembly algorithm. 147 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 148 * could do with it working on IPv4 149 * User settable/learned rtt/max window/mtu 150 * Cope with MTU/device switches when retransmitting in tcp. 151 * Fix the window handling to use PR's new code. 152 * 153 * Change the fundamental structure to a single send queue maintained 154 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 155 * active routes too]). Cut the queue off in tcp_retransmit/ 156 * tcp_transmit. 157 * Change the receive queue to assemble as it goes. This lets us 158 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 159 * tcp_data/tcp_read as well as the window shrink crud. 
160 * Seperate out duplicated code - tcp_alloc_skb, tcp_build_ack 161 * tcp_queue_skb seem obvious routines to extract. 162 * 163 * This program is free software; you can redistribute it and/or 164 * modify it under the terms of the GNU General Public License 165 * as published by the Free Software Foundation; either version 166 * 2 of the License, or(at your option) any later version. 167 * 168 * Description of States: 169 * 170 * TCP_SYN_SENT sent a connection request, waiting for ack 171 * 172 * TCP_SYN_RECV received a connection request, sent ack, 173 * waiting for final ack in three-way handshake. 174 * 175 * TCP_ESTABLISHED connection established 176 * 177 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 178 * transmission of remaining buffered data 179 * 180 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 181 * to shutdown 182 * 183 * TCP_CLOSING both sides have shutdown but we still have 184 * data we have to finish sending 185 * 186 * TCP_TIME_WAIT timeout to catch resent junk before entering 187 * closed, can only be entered from FIN_WAIT2 188 * or CLOSING. Required because the other end 189 * may not have gotten our last ACK causing it 190 * to retransmit the data packet (which we ignore) 191 * 192 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 193 * us to finish writing our data and to shutdown 194 * (we have to close() to move on to LAST_ACK) 195 * 196 * TCP_LAST_ACK out side has shutdown after remote has 197 * shutdown. There may still be data in our 198 * buffer that we have to finish sending 199 * 200 * TCP_CLOSE socket is finished 201 */ 202
203 #include <linux/types.h>
204 #include <linux/sched.h>
205 #include <linux/mm.h>
206 #include <linux/time.h>
207 #include <linux/string.h>
208 #include <linux/config.h>
209 #include <linux/socket.h>
210 #include <linux/sockios.h>
211 #include <linux/termios.h>
212 #include <linux/in.h>
213 #include <linux/fcntl.h>
214 #include <linux/inet.h>
215 #include <linux/netdevice.h>
216 #include "snmp.h"
217 #include "ip.h"
218 #include "protocol.h"
219 #include "icmp.h"
220 #include "tcp.h"
221 #include "arp.h"
222 #include <linux/skbuff.h>
223 #include "sock.h"
224 #include "route.h"
225 #include <linux/errno.h>
226 #include <linux/timer.h>
227 #include <asm/system.h>
228 #include <asm/segment.h>
229 #include <linux/mm.h>
230
/*
 *	The MSL timer is the 'normal' timer.
 */

#define reset_msl_timer(x,y,z)	reset_timer(x,y,z)

/* NOTE(review): SEQ_TICK and seq_offset appear to feed the initial
   sequence number generation elsewhere in this file - confirm there. */
#define SEQ_TICK 3
unsigned long seq_offset;

/* TCP group SNMP counters (struct tcp_mib comes from snmp.h); updated
   throughout this file, e.g. TcpCurrEstab in tcp_set_state(). */
struct tcp_mib	tcp_statistics;

/* Forward declaration: tcp_close() is defined later in this file but is
   already needed by tcp_close_pending(). */
static void tcp_close(struct sock *sk, int timeout);

/*
 *	The less said about this the better, but it works and will do for 1.2
 */

/* select() sleepers on a listening socket wait here; woken by
   tcp_set_state() when a child's three-way handshake completes. */
static struct wait_queue *master_select_wakeup;
/*
 *	Return the smaller of two unsigned quantities.
 */
static __inline__ int min(unsigned int a, unsigned int b)
{
	return (a < b) ? a : b;
}
#undef STATE_TRACE

#ifdef STATE_TRACE
/* Human-readable names for the TCP_* state constants, indexed by state
   number; used only by the debug printk in tcp_set_state(). */
static char *statename[]={
	"Unused","Established","Syn Sent","Syn Recv",
	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
	"Close Wait","Last ACK","Listen","Closing"
};
#endif
/*
 *	Move a socket to a new TCP state, keeping the established-connection
 *	statistic and the accept()/select() wakeup in step with the change.
 */
static __inline__ void tcp_set_state(struct sock *sk, int state)
{
	if(sk->state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab--;	/* leaving ESTABLISHED */
#ifdef STATE_TRACE
	if(sk->debug)
		printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
#endif
	/* This is a hack but it doesn't occur often and it's going to
	   be a real pain to fix nicely */

	/* A child of a listening socket just finished its handshake:
	   wake anyone select()ing on the master (master_select_wakeup). */
	if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
	{
		wake_up_interruptible(&master_select_wakeup);
	}
	sk->state=state;
	if(state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab++;	/* entering ESTABLISHED */
}
/*
 *	This routine picks a TCP window for a socket based on
 *	the following constraints
 *
 *	1. The window can never be shrunk once it is offered (RFC 793)
 *	2. We limit memory per socket
 *
 *	For now we use NET2E3's heuristic of offering half the memory
 *	we have handy. All is not as bad as this seems however because
 *	of two things. Firstly we will bin packets even within the window
 *	in order to get the data we are waiting for into the memory limit.
 *	Secondly we bin common duplicate forms at receive time
 *	Better heuristics welcome
 */

int tcp_select_window(struct sock *sk)
{
	int new_window = sk->prot->rspace(sk);	/* receive buffer space available */

	/* Honour any administrative clamp on the advertised window. */
	if(sk->window_clamp)
		new_window=min(sk->window_clamp,new_window);
	/*
	 *	Two things are going on here.  First, we don't ever offer a
	 *	window less than min(sk->mss, MAX_WINDOW/2).  This is the
	 *	receiver side of SWS as specified in RFC1122.
	 *	Second, we always give them at least the window they
	 *	had before, in order to avoid retracting window.  This
	 *	is technically allowed, but RFC1122 advises against it and
	 *	in practice it causes trouble.
	 *
	 *	Fixme: This doesn't correctly handle the case where
	 *	new_window > sk->window but not by enough to allow for the
	 *	shift in sequence space.
	 */
	if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
		return(sk->window);
	return(new_window);
}
326 /* 327 * Find someone to 'accept'. Must be called with 328 * sk->inuse=1 or cli() 329 */ 330
static struct sk_buff *tcp_find_established(struct sock *s)
{
	/* Walk the listening socket's receive queue; each queued skb's
	   ->sk points at an embryonic child connection. */
	struct sk_buff *p=skb_peek(&s->receive_queue);
	if(p==NULL)
		return NULL;
	do
	{
		/* A child is ready to accept once established, or already
		   past that (>= FIN_WAIT1: the peer began closing again). */
		if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
			return p;
		p=p->next;
	}
	while(p!=(struct sk_buff *)&s->receive_queue);	/* queue head marks the end */
	return NULL;
}
346 /* 347 * Remove a completed connection and return it. This is used by 348 * tcp_accept() to get connections from the queue. 349 */ 350
static struct sk_buff *tcp_dequeue_established(struct sock *s)
{
	struct sk_buff *skb;
	unsigned long flags;

	/* Interrupts off while we search and unlink: the receive queue
	   is also manipulated at interrupt time. */
	save_flags(flags);
	cli();
	skb=tcp_find_established(s);
	if(skb!=NULL)
		skb_unlink(skb);	/* Take it off the queue */
	restore_flags(flags);
	return skb;	/* NULL when no completed connection is waiting */
}
364 /* 365 * This routine closes sockets which have been at least partially 366 * opened, but not yet accepted. Currently it is only called by 367 * tcp_close, and timeout mirrors the value there. 368 */ 369
370 staticvoidtcp_close_pending (structsock *sk)
/* */ 371 { 372 structsk_buff *skb;
373
374 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) { 375 tcp_close(skb->sk, 0);
376 kfree_skb(skb, FREE_READ);
377 } 378 return;
379 } 380
381 /* 382 * Enter the time wait state. 383 */ 384
static void tcp_time_wait(struct sock *sk)
{
	tcp_set_state(sk,TCP_TIME_WAIT);
	sk->shutdown = SHUTDOWN_MASK;	/* no further data in either direction */
	if (!sk->dead)
		sk->state_change(sk);	/* wake anyone sleeping on this socket */
	/* Hold the socket for the TIME_WAIT period before final close. */
	reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}
394 /* 395 * A socket has timed out on its send queue and wants to do a 396 * little retransmitting. Currently this means TCP. 397 */ 398
/*
 *	Walk the send queue from sk->send_head, refreshing each frame's
 *	ack/window fields and handing it back to the device. If 'all' is
 *	zero only the head frame is resent; otherwise we continue up to
 *	a congestion window's worth of frames.
 */
void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;
	int ct=0;

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;	/* timestamp this (re)transmission */

		/*
		 * In general it's OK just to use the old packet.  However we
		 * need to use the current ack and window fields.  Urg and
		 * urg_ptr could possibly stand to be updated as well, but we
		 * don't keep the necessary data.  That shouldn't be a problem,
		 * if the other end is doing the right thing.  Since we're
		 * changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)(skb->data + dev->hard_header_len);
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = skb->len - (((unsigned char *) th) - skb->data);

		/*
		 *	Note: We ought to check for window limits here but
		 *	currently this is done (less efficiently) elsewhere.
		 *	We do need to check for a route change but can't handle
		 *	that until we have the new 1.3.x buffers in.
		 *
		 */

		iph->id = htons(ip_id_count++);
		ip_send_check(iph);	/* IP header checksum must be redone after the id change */

		/*
		 *	This is not the right way to handle this. We have to
		 *	issue an up to date window and ack report with this
		 *	retransmit to keep the odd buggy tcp that relies on
		 *	the fact BSD does this happy.
		 *	We don't however need to recalculate the entire
		 *	checksum, so someone wanting a small problem to play
		 *	with might like to implement RFC1141/RFC1624 and speed
		 *	this up by avoiding a full checksum.
		 */

		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));
		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		/*
		 *	If the interface is (still) up and running, kick it.
		 */

		if (dev->flags & IFF_UP)
		{
			/*
			 *	If the packet is still being sent by the device/protocol
			 *	below then don't retransmit. This is both needed, and good -
			 *	especially with connected mode AX.25 where it stops resends
			 *	occurring of an as yet unsent anyway frame!
			 *	We still add up the counts as the round trip time wants
			 *	adjusting.
			 */
			if (sk && !skb_device_locked(skb))
			{
				/* Remove it from any existing driver queue first! */
				skb_unlink(skb);
				/* Now queue it */
				ip_statistics.IpOutRequests++;
				dev_queue_xmit(skb, dev, sk->priority);
			}
		}

		/*
		 *	Count retransmissions
		 */

		ct++;
		sk->prot->retransmits ++;

		/*
		 *	Only one retransmit requested.
		 */

		if (!all)
			break;

		/*
		 *	This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;	/* next frame on the send queue */
	}
}
506 /* 507 * Reset the retransmission timer 508 */ 509
510 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 511 { 512 del_timer(&sk->retransmit_timer);
513 sk->ip_xmit_timeout = why;
514 if((int)when < 0)
515 { 516 when=3;
517 printk("Error: Negative timer in xmit_timer\n");
518 } 519 sk->retransmit_timer.expires=when;
520 add_timer(&sk->retransmit_timer);
521 } 522
523 /* 524 * This is the normal code called for timeouts. It does the retransmission 525 * and then does backoff. tcp_do_retransmit is separated out because 526 * tcp_ack needs to send stuff from the retransmit queue without 527 * initiating a backoff. 528 */ 529
530
/*
 *	Retransmit and then back off the retransmission timeout.
 */
void tcp_retransmit_time(struct sock *sk, int all)
{
	tcp_do_retransmit(sk, all);

	/*
	 * Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */

	sk->retransmits++;
	sk->backoff++;
	sk->rto = min(sk->rto << 1, 120*HZ);	/* exponential backoff, clamped at 120s */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
}
558
/*
 *	A timer event has triggered a tcp retransmit timeout. The
 *	socket xmit queue is ready and set up to send. Because
 *	the ack receive code keeps the queue straight we do
 *	nothing clever here.
 */

static void tcp_retransmit(struct sock *sk, int all)
{
	if (all)
	{
		/* Resend the whole queue without touching the congestion
		   state (used when the caller only wants the resend). */
		tcp_retransmit_time(sk, all);
		return;
	}

	sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
	/* sk->ssthresh in theory can be zero.  I guess that's OK */
	sk->cong_count = 0;

	sk->cong_window = 1;	/* back to slow start: one segment in flight */

	/* Do the actual retransmit. */
	tcp_retransmit_time(sk, all);
}
/*
 *	A write timeout has occurred. Process the after effects.
 */

/*
 *	Handle the aftermath of a (re)transmission timeout.
 *	Returns 0 if the socket was moved to TCP_CLOSE (give up),
 *	1 if the connection is still worth keeping.
 */
static int tcp_write_timeout(struct sock *sk)
{
	/*
	 *	Look for a 'soft' timeout.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
	{
		/*
		 *	Attempt to recover if arp has changed (unlikely!) or
		 *	a route has shifted (not supported prior to 1.3).
		 */
		arp_destroy (sk->daddr, 0);
		ip_route_check (sk->daddr);
	}
	/*
	 *	Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2)
	{
		sk->err = ETIMEDOUT;
		sk->error_report(sk);
		/*
		 *	Time wait the socket
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING)
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 *	Clean up time.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			return 0;	/* socket is gone; caller must stop using it */
		}
	}
	return 1;	/* still alive; keep retrying */
}
630 /* 631 * The TCP retransmit timer. This lacks a few small details. 632 * 633 * 1. An initial rtt timeout on the probe0 should cause what we can 634 * of the first write queue buffer to be split and sent. 635 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report 636 * ETIMEDOUT if we know an additional 'soft' error caused this. 637 * tcp_err should save a 'soft error' for us. 638 */ 639
/*
 *	Timer callback: 'data' is the struct sock that owns the timer.
 *	Dispatches on sk->ip_xmit_timeout to window-probe, retransmit
 *	or send a keepalive.
 */
static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;	/* reason the timer was armed */

	/*
	 *	only process if socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;	/* we own the socket from here on */
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			/* tcp_write_timeout() returns 0 when it closed the socket */
			if (tcp_write_timeout(sk))
				release_sock (sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				/* Nothing queued: the ack wakeup above was all we owed. */
				restore_flags(flags);
			}
			else
			{
				/*
				 * Kicked by a delayed ack. Reset timer
				 * correctly now
				 */
				if (jiffies < skb->when + sk->rto)
				{
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					release_sock (sk);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission
				 */
				sk->prot->retransmit (sk, 0);
				if(!tcp_write_timeout(sk))
					break;	/* socket was closed by tcp_write_timeout() */
			}
			release_sock (sk);
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			if (tcp_write_timeout(sk))
				release_sock (sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			release_sock (sk);
			break;
	}
}
743 /* 744 * This routine is called by the ICMP module when it gets some 745 * sort of error condition. If err < 0 then the socket should 746 * be closed and the error returned to the user. If err > 0 747 * it's just the icmp type << 8 | icmp code. After adjustment 748 * header points to the first 8 bytes of the tcp header. We need 749 * to find the appropriate port. 750 */ 751
void tcp_err(int err, unsigned char *header, unsigned long daddr,
	unsigned long saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	struct iphdr *iph=(struct iphdr *)header;

	header+=4*iph->ihl;	/* skip the IP header to reach the embedded TCP header */


	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)
		return;	/* not one of our connections */

	if(err<0)
	{
		/* err < 0: hard error - hand it straight to the user. */
		sk->err = -err;
		sk->error_report(sk);
		return;
	}

	if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8))
	{
		/*
		 * FIXME:
		 * For now we will just trigger a linear backoff.
		 * The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

/*	sk->err = icmp_err_convert[err & 0xff].errno;  -- moved as TCP should hide non fatals internally (and does) */

	/*
	 * If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 */

	if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT)
	{
		if (sk->state == TCP_SYN_SENT)
		{
			/* Fatal ICMP error while connecting: fail the attempt. */
			tcp_statistics.TcpAttemptFails++;
			tcp_set_state(sk,TCP_CLOSE);
			sk->error_report(sk);	/* Wake people up to see the error (see connect in sock.c) */
		}
		sk->err = icmp_err_convert[err & 0xff].errno;
	}
	return;
}
807
808 /* 809 * Walk down the receive queue counting readable data until we hit the end or we find a gap 810 * in the received data queue (ie a frame missing that needs sending to us). Not 811 * sorting using two queues as data arrives makes life so much harder. 812 */ 813
static int tcp_readable(struct sock *sk)
{
	unsigned long counted;	/* sequence number we have counted up to */
	unsigned long amount;	/* readable bytes found so far */
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	save_flags(flags);
	cli();	/* the receive queue is also touched at interrupt time */
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 *	Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->h.th->seq))	/* Found a hole so stops here */
			break;
		sum = skb->len -(counted - skb->h.th->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;	/* SYN occupies a sequence number but carries no data */
		if (sum > 0)
		{	/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop.  This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
/*
 *	LISTEN is a special case for select..
 */
static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sel_type == SEL_IN) {
		int retval;

		sk->inuse = 1;
		/* Readable == there is a completed connection to accept(). */
		retval = (tcp_find_established(sk) != NULL);
		release_sock(sk);
		if (!retval)
			select_wait(&master_select_wakeup,wait);	/* sleep until a handshake completes */
		return retval;
	}
	return 0;	/* a listening socket is never writable or exceptional */
}
903
/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to set "sk->inuse", as the upper select layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type) {
	case SEL_IN:
		if (sk->err)
			return 1;	/* a pending error counts as readable */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;	/* not connected yet: nothing to read */

		if (sk->shutdown & RCV_SHUTDOWN)
			return 1;	/* EOF is readable */

		if (sk->acked_seq == sk->copied_seq)
			break;	/* everything received has already been read */

		/* Data is present: readable unless the only unread byte is
		   out-of-band urgent data (and we are not inlining it). */
		if (sk->urg_seq != sk->copied_seq ||
		    sk->acked_seq != sk->copied_seq+1 ||
		    sk->urginline || !sk->urg_data)
			return 1;
		break;

	case SEL_OUT:
		if (sk->shutdown & SEND_SHUTDOWN)
			return 0;	/* sending side is closed */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;	/* connection not up yet */
		/*
		 * This is now right thanks to a small fix
		 * by Matt Dillon.
		 */

		if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
			break;	/* not enough buffer space for a full segment */
		return 1;

	case SEL_EX:
		if (sk->err || sk->urg_data)
			return 1;	/* exceptional: error or urgent data pending */
		break;
	}
	select_wait(sk->sleep, wait);	/* nothing ready: sleep on the socket */
	return 0;
}
/*
 *	ioctl() handling for TCP sockets: pending read byte count
 *	(TIOCINQ), at-urgent-mark test (SIOCATMARK) and unsent byte
 *	count (TIOCOUTQ). 'arg' is a user-space pointer for the result.
 */
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	int err;
	switch(cmd)
	{

		case TIOCINQ:
#ifdef FIXME	/* FIXME: */
		case FIONREAD:
#endif
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);

			sk->inuse = 1;
			amount = tcp_readable(sk);	/* bytes that can be read right now */
			release_sock(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg,
						   sizeof(unsigned long));
			if(err)
				return err;
			put_fs_long(amount,(unsigned long *)arg);
			return(0);
		}
		case SIOCATMARK:
		{
			/* True when the next byte to read is the urgent mark. */
			int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;

			err = verify_area(VERIFY_WRITE,(void *) arg,
						  sizeof(unsigned long));
			if (err)
				return err;
			put_fs_long(answ,(int *) arg);
			return(0);
		}
		case TIOCOUTQ:
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN) return(-EINVAL);
			amount = sk->prot->wspace(sk);	/* free space in the send buffer */
			err=verify_area(VERIFY_WRITE,(void *)arg,
						   sizeof(unsigned long));
			if(err)
				return err;
			put_fs_long(amount,(unsigned long *)arg);
			return(0);
		}
		default:
			return(-EINVAL);
	}
}
1013
1014 /*1015 * This routine computes a TCP checksum. 1016 */1017
/*
 *	i386-only inline assembler. The first block folds the TCP pseudo
 *	header (source address, destination address, protocol and length)
 *	into the running sum; the second sums the TCP header and data
 *	32 bytes at a time, then the stragglers, then folds the carries
 *	into the low 16 bits. Returns the one's-complement checksum.
 */
unsigned short tcp_check(struct tcphdr *th, int len,
	  unsigned long saddr, unsigned long daddr)
{
	unsigned long sum;

	if (saddr == 0) saddr = ip_my_addr();

	/*
	 * stupid, gcc complains when I use just one __asm__ block,
	 * something about too many reloads, but this is just two
	 * instructions longer than what I want
	 */
	__asm__("
	    addl %%ecx, %%ebx
	    adcl %%edx, %%ebx
	    adcl $0, %%ebx
	    "
	: "=b"(sum)
	: "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
	: "bx", "cx", "dx" );
	__asm__("
	    movl %%ecx, %%edx
	    cld
	    cmpl $32, %%ecx
	    jb 2f
	    shrl $5, %%ecx
	    clc
1:	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    loop 1b
	    adcl $0, %%ebx
	    movl %%edx, %%ecx
2:	    andl $28, %%ecx
	    je 4f
	    shrl $2, %%ecx
	    clc
3:	    lodsl
	    adcl %%eax, %%ebx
	    loop 3b
	    adcl $0, %%ebx
4:	    movl $0, %%eax
	    testw $2, %%dx
	    je 5f
	    lodsw
	    addl %%eax, %%ebx
	    adcl $0, %%ebx
	    movw $0, %%ax
5:	    test $1, %%edx
	    je 6f
	    lodsb
	    addl %%eax, %%ebx
	    adcl $0, %%ebx
6:	    movl %%ebx, %%eax
	    shrl $16, %%eax
	    addw %%ax, %%bx
	    adcw $0, %%bx
	    "
	: "=b"(sum)
	: "0"(sum), "c"(len), "S"(th)
	: "ax", "bx", "cx", "dx", "si" );

	/* We only want the bottom 16 bits, but we never cleared the top 16. */

	return((~sum) & 0xffff);
}
1098
1099
1100 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1101 unsignedlongdaddr, intlen, structsock *sk)
1102 {1103 th->check = 0;
1104 th->check = tcp_check(th, len, saddr, daddr);
1105 return;
1106 }1107
/*
 * This is the main buffer sending routine. We queue the buffer
 * having checked it is sane seeming.
 *
 * Either transmits 'skb' immediately or appends it to the socket's
 * write queue when the window, Nagle rule, or congestion window
 * forbids sending now.  Takes ownership of skb in all paths
 * (frees it on sanity-check failure).
 */
static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 * length of packet (not counting length of pre-tcp headers)
	 */
	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 * Sanity check it..
	 */
	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 * If we have queued a header size packet.. (these crash a few
	 * tcp stacks if ack is not set)
	 */
	if (size == sizeof(struct tcphdr))
	{
		/* If its got a syn or fin its notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 * Actual processing.
	 */
	tcp_statistics.TcpOutSegs++;
	/* Sequence number of the byte after this segment; SYN/FIN occupy
	 * sequence space but are covered by th->doff adjustment here. */
	skb->h.seq = ntohl(th->seq) + size - 4*th->doff;

	/*
	 * We must queue if
	 *
	 * a) The right edge of this frame exceeds the window
	 * b) We are retransmitting (Nagle's rule)
	 * c) We have too many packets 'in flight'
	 */
	if (after(skb->h.seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit. So
		 * we shouldn't need to set it at all.  I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 * If we don't fit we have to start the zero window
		 * probes. This is broken - we really need to do a partial
		 * send _first_ (This is what causes the Cisco and PC/TCP
		 * grief).
		 */
		if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 * This is going straight out
		 */
		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 * This is mad. The tcp retransmit queue is put together
		 * by the ip layer. This causes half the problems with
		 * unroutable FIN's and other things.
		 */
		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 * Set for next retransmit based on expected ACK time.
		 * FIXME: We set this every time which means our
		 * retransmits are really about a window behind.
		 */
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
1223 /*1224 * Locking problems lead us to a messy situation where we can have1225 * multiple partially complete buffers queued up. This is really bad1226 * as we don't want to be sending partial buffers. Fix this with1227 * a semaphore or similar to lock tcp_write per socket.1228 *1229 * These routines are pretty self descriptive.1230 */1231
1232 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1233 {1234 structsk_buff * skb;
1235 unsignedlongflags;
1236
1237 save_flags(flags);
1238 cli();
1239 skb = sk->partial;
1240 if (skb) {1241 sk->partial = NULL;
1242 del_timer(&sk->partial_timer);
1243 }1244 restore_flags(flags);
1245 returnskb;
1246 }1247
1248 /*1249 * Empty the partial queue1250 */1251
1252 staticvoidtcp_send_partial(structsock *sk)
/* */1253 {1254 structsk_buff *skb;
1255
1256 if (sk == NULL)
1257 return;
1258 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1259 tcp_send_skb(sk, skb);
1260 }1261
1262 /*1263 * Queue a partial frame1264 */1265
1266 voidtcp_enqueue_partial(structsk_buff * skb, structsock * sk)
/* */1267 {1268 structsk_buff * tmp;
1269 unsignedlongflags;
1270
1271 save_flags(flags);
1272 cli();
1273 tmp = sk->partial;
1274 if (tmp)
1275 del_timer(&sk->partial_timer);
1276 sk->partial = skb;
1277 init_timer(&sk->partial_timer);
1278 /*1279 * Wait up to 1 second for the buffer to fill.1280 */1281 sk->partial_timer.expires = HZ;
1282 sk->partial_timer.function = (void (*)(unsignedlong)) tcp_send_partial;
1283 sk->partial_timer.data = (unsignedlong) sk;
1284 add_timer(&sk->partial_timer);
1285 restore_flags(flags);
1286 if (tmp)
1287 tcp_send_skb(sk, tmp);
1288 }1289
1290
/*
 * This routine sends an ack and also updates the window.
 *
 * 'sequence'/'ack' are host-order sequence numbers; 'th' is the
 * header of the segment being acknowledged (used as a template and
 * for the port swap); 'daddr' is the destination address.  If no
 * atomic memory is available the ack is deferred via ack_backlog
 * and the write timer instead of being dropped outright.
 */
static void tcp_send_ack(unsigned long sequence, unsigned long ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;		/* We have been reset, we may not send again */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */
	buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 * Force it to send an ack. We don't have to do this
		 * (ACK is unreliable) but its much better use of
		 * bandwidth on slow links to send a spare ack than
		 * resend packets.
		 */
		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 * Assemble a suitable TCP frame
	 */
	buff->len = sizeof(struct tcphdr);
	buff->sk = sk;
	buff->localroute = sk->localroute;
	t1 =(struct tcphdr *) buff->data;

	/*
	 * Put in the IP header and routing stuff.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route: give the buffer back, the ack is unreliable anyway. */
		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}
	buff->len += tmp;
	t1 =(struct tcphdr *)((char *)t1 +tmp);

	/* Start from the incoming header, then fix up the fields we need. */
	memcpy(t1, th, sizeof(*t1));

	/*
	 * Swap the send and the receive.
	 */
	t1->dest = th->source;
	t1->source = th->dest;
	/* NOTE: ntohl used for host-to-net conversion; the two are the
	 * same operation on the architectures this code targets. */
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 * If we have nothing queued for transmit and the transmit timer
	 * is on we are just doing an ACK timeout and need to switch
	 * to a keepalive.
	 */
	if (ack == sk->acked_seq)
	{
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
			&& sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 * Fill in the packet and send it
	 */
	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}
1409
1410 /* 1411 * This routine builds a generic TCP header. 1412 */1413
1414 extern__inlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */1415 {1416
1417 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1418 th->seq = htonl(sk->write_seq);
1419 th->psh =(push == 0) ? 1 : 0;
1420 th->doff = sizeof(*th)/4;
1421 th->ack = 1;
1422 th->fin = 0;
1423 sk->ack_backlog = 0;
1424 sk->bytes_rcv = 0;
1425 sk->ack_timed = 0;
1426 th->ack_seq = htonl(sk->acked_seq);
1427 sk->window = tcp_select_window(sk);
1428 th->window = htons(sk->window);
1429
1430 return(sizeof(*th));
1431 }1432
/*
 * This routine copies from a user buffer into a socket,
 * and starts the transmit system.
 *
 * Returns the number of bytes queued/sent, or a negative errno when
 * nothing could be written.  May sleep (unless 'nonblock') waiting
 * for connection establishment or for buffer memory.
 */
static int tcp_write(struct sock *sk, unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;
	int copy;
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	unsigned char *buff;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1;
	prot = sk->prot;
	while(len > 0)
	{
		if (sk->err)
		{			/* Stop on an error */
			release_sock(sk);
			if (copied)
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 * First thing we do is make sure that we are established.
		 */
		if (sk->shutdown & SEND_SHUTDOWN)
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied)
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/*
		 * Wait for a connection to finish.
		 */
		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
		{
			if (sk->err)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
			{
				release_sock(sk);
				if (copied)
					return(copied);

				if (sk->err)
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen)
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			release_sock(sk);
			cli();

			if (sk->state != TCP_ESTABLISHED &&
			    sk->state != TCP_CLOSE_WAIT && sk->err == 0)
			{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

		/*
		 * The following code can result in copy <= if sk->mss is ever
		 * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
		 * sk->mtu is constant once SYN processing is finished.  I.e. we
		 * had better not get here until we've seen his SYN and at least one
		 * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
		 * But ESTABLISHED should guarantee that.  sk->max_window is by definition
		 * non-decreasing.  Note that any ioctl to set user_mss must be done
		 * before the exchange of SYN's.  If the initial ack from the other
		 * end has a window of 0, max_window and thus mss will both be 0.
		 */

		/*
		 * Now we need to check if we have a half built packet.
		 */
		if ((skb = tcp_dequeue_partial(sk)) != NULL)
		{
			int hdrlen;

			/* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				 + sizeof(struct tcphdr);

			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB))
			{
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0)
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}

				memcpy_fromfs(skb->data + skb->len, from, copy);
				skb->len += copy;
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy;
			}
			/* Send the partial frame if it is now full, urgent data
			 * is involved, or nothing is in flight; otherwise requeue. */
			if ((skb->len - hdrlen) >= sk->mss ||
				(flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);
			continue;
		}

		/*
		 * We also need to worry about the window.
		 * If window < 1/2 the maximum window we've seen from this
		 * host, don't use it.  This is sender side
		 * silly window prevention, as specified in RFC1122.
		 * (Note that this is different than earlier versions of
		 * SWS prevention, e.g. RFC813.).  What we actually do is
		 * use the whole MSS.  Since the results in the right
		 * edge of the packet being outside the window, it will
		 * be queued for later rather than sent.
		 */
		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		if (copy > len)
			copy = len;

		/*
		 * We should really check the window here also.
		 */
		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB))
		{
			/*
			 * We will release the socket incase we sleep here.
			 */
			release_sock(sk);
			/*
			 * NB: following must be mtu, because mss can be increased.
			 * mss is always <= mtu
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb;
		}
		else
		{
			/*
			 * We will release the socket incase we sleep here.
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 * If we didn't get any memory, we need to sleep.
		 */
		if (skb == NULL)
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 * FIXME: here is another race condition.
			 */
			tmp = sk->wmem_alloc;
			release_sock(sk);
			cli();
			/*
			 * Again we will try to avoid it.
			 */
			if (tmp <= sk->wmem_alloc &&
				  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0)
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->len = 0;
		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

		buff = skb->data;

		/*
		 * FIXME: we need to optimize this.
		 * Perhaps some hints here would be good.
		 */
		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 )
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}
		skb->len += tmp;
		skb->dev = dev;
		buff += tmp;
		skb->h.th =(struct tcphdr *) buff;
		tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
		if (tmp < 0)
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB)
		{
			((struct tcphdr *)buff)->urg = 1;
			((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
		}
		skb->len += tmp;
		memcpy_fromfs(buff+tmp, from, copy);

		from += copy;
		copied += copy;
		len -= copy;
		skb->len += copy;
		skb->free = 0;
		sk->write_seq += copy;

		/* A short (sub-mss) frame while data is in flight is held
		 * back as a partial buffer per Nagle's algorithm. */
		if (send_tmp != NULL && sk->packets_out)
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

	/*
	 * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
	 * interactive fast network servers. It's meant to be on and
	 * it really improves the throughput though not the echo time
	 * on my slow slip link - Alan
	 */

	/*
	 * Avoid possible race on send_tmp - c/o Johannes Stille
	 */
	if(sk->partial && ((!sk->packets_out)
	/* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
		))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
1763 /*1764 * This is just a wrapper. 1765 */1766
1767 staticinttcp_sendto(structsock *sk, unsignedchar *from,
/* */1768 intlen, intnonblock, unsignedflags,
1769 structsockaddr_in *addr, intaddr_len)
1770 {1771 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1772 return -EINVAL;
1773 if (sk->state == TCP_CLOSE)
1774 return -ENOTCONN;
1775 if (addr_len < sizeof(*addr))
1776 return -EINVAL;
1777 if (addr->sin_family && addr->sin_family != AF_INET)
1778 return -EINVAL;
1779 if (addr->sin_port != sk->dummy_th.dest)
1780 return -EISCONN;
1781 if (addr->sin_addr.s_addr != sk->daddr)
1782 return -EISCONN;
1783 returntcp_write(sk, from, len, nonblock, flags);
1784 }1785
1786
/*
 * Send an ack if one is backlogged at this point. Ought to merge
 * this with tcp_send_ack().
 *
 * Builds a bare ACK segment from the socket's template header and
 * transmits it, clearing the ack backlog.  If no memory is available
 * the write timer is armed to retry shortly.
 */
static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	if (!sk->ack_backlog)
		return;

	/*
	 * FIXME: we need to put code here to prevent this routine from
	 * being called. Being called once in a while is ok, so only check
	 * if this is the second time in a row.
	 */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */
	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->len = sizeof(struct tcphdr);
	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 * Put in the IP header and routing stuff.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route - drop the ack; it is unreliable anyway. */
		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}

	buff->len += tmp;
	t1 =(struct tcphdr *)(buff->data +tmp);

	/* Clone the template header, then set the ack-specific fields. */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
1861
/*
 * FIXME:
 * This routine frees used buffers.
 * It should consider sending an ACK to let the
 * other end know we now have a bigger window.
 *
 * Called after data has been read from the socket: releases consumed
 * receive buffers and, when that opened up receive space, schedules
 * or sends a window-update ACK.
 */
static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	/* Receive space before freeing, so we can detect a change below. */
	left = sk->prot->rspace(sk);

	/*
	 * We have to loop through all the buffer headers,
	 * and try to free up all the space we can.
	 */
	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		/* Stop at the first buffer still unread or still in use
		 * by a sleeping reader (skb->users). */
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;	/* so kfree_skb credits this socket's rmem */
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 * FIXME:
	 * At this point we should send an ack if the difference
	 * in the window, and the amount of space is bigger than
	 * TCP_WINDOW_DIFF.
	 */
	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
			left);
	if ((rspace=sk->prot->rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
		/*
		 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
		 * if the other end is offering a window smaller than the agreed on MSS
		 * (called sk->mtu here).  In theory there's no connection between send
		 * and receive, and so no reason to think that they're going to send
		 * small packets.  For the moment I'm using the hack of reducing the mss
		 * only on the send side, so I'm putting mtu here.
		 */
		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);
		}
	}
}
1951
/*
 * Handle reading urgent data. BSD has very simple semantics for
 * this, no blocking and very strange errors 8)
 *
 * Copies the single byte of out-of-band data (if valid and unread)
 * to the user buffer 'to'.  Returns 1 on success, 0 at EOF-like
 * states, or a negative errno.  Never blocks, regardless of the
 * socket's blocking mode.
 */
static int tcp_read_urg(struct sock * sk, int nonblock,
	     unsigned char *to, int len, unsigned flags)
{
	/*
	 * No URG data to read
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
	{
		int tmp = -sk->err;
		sk->err = 0;
		return tmp;
	}

	if (sk->state == TCP_CLOSE || sk->done)
	{
		/* First read after close reports clean EOF (0); later ones error. */
		if (!sk->done) {
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID)
	{
		/* Low byte of urg_data holds the OOB byte itself. */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;
		put_fs_byte(c, to);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
2009
/*
 * This routine copies from a sock struct into the user buffer.
 *
 * Blocking receive path: walks the receive queue copying in-sequence
 * data to user space, honouring MSG_PEEK (no consume) and urgent-data
 * boundaries, sleeping when no data is available (unless 'nonblock').
 * Returns bytes copied or a negative errno.
 */
static int tcp_read(struct sock *sk, unsigned char *to,
	int len, int nonblock, unsigned flags)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	unsigned long peek_seq;
	volatile unsigned long *seq;	/* So gcc doesnt overoptimise */
	unsigned long used;

	/*
	 * This error should be checked.
	 */
	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 * Urgent data needs to be handled specially.
	 */
	if (flags & MSG_OOB)
		return tcp_read_urg(sk, nonblock, to, len, flags);

	/*
	 * Copying sequence to update. This is volatile to handle
	 * the multi-reader case neatly (memcpy_to/fromfs might be
	 * inline and thus not flush cached variables otherwise).
	 * With MSG_PEEK we advance a private copy instead of
	 * sk->copied_seq, so nothing is consumed.
	 */
	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		unsigned long offset;

		/*
		 * Are we at urgent data? Stop if we have read anything.
		 */
		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 * Next get a buffer.
		 */
		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* A gap before this buffer: nothing in sequence yet. */
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies a sequence number but no data */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* fully consumed; cleanup_rbuf may free it */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		if (copied)
			break;

		if (sk->err)
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			/* First read after close reports EOF; subsequent reads error. */
			if (!sk->done)
			{
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* No data yet: release consumed buffers (may send an ack
		 * opening the window) and sleep until something arrives. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 * Lock the buffer. We can be fairly relaxed as
		 * an interrupt will never steal a buffer we are
		 * using unless I've missed something serious in
		 * tcp_data.
		 */
		skb->users++;

		/*
		 * Ok so how much can we use ?
		 */
		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 * Do we have urgent data here?
		 */
		if (sk->urg_data)
		{
			unsigned long urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					/* Skip the urgent byte unless OOB data is inline. */
					if (!sk->urginline)
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;	/* stop just before the urgent byte */
			}
		}

		/*
		 * Copy it - We _MUST_ update *seq first so that we
		 * don't ever double read when we have dual readers
		 */
		*seq += used;

		/*
		 * This memcpy_tofs can sleep. If it sleeps and we
		 * do a second read it relies on the skb->users to avoid
		 * a crash when cleanup_rbuf() gets called.
		 */
		memcpy_tofs(to,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		to += used;

		/*
		 * We now will not sleep again until we are finished
		 * with skb. Sorry if you are doing the SMP port
		 * but you'll just have to fix it neatly ;)
		 */
		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;
		if (used + offset < skb->len)
			continue;

		/*
		 * Process the FIN.
		 */
		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		continue;

	found_fin_ok:
		++*seq;		/* the FIN consumes one sequence number */
		if (flags & MSG_PEEK)
			break;

		/*
		 * All is done
		 */
		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2241 /*2242 * State processing on a close. This implements the state shift for2243 * sending our FIN frame. Note that we only send a FIN for some 2244 * states. A shutdown() may have already sent the FIN, or we may be2245 * closed.2246 */2247
2248 staticinttcp_close_state(structsock *sk, intdead)
/* */2249 {2250 intns=TCP_CLOSE;
2251 intsend_fin=0;
2252 switch(sk->state)
2253 {2254 caseTCP_SYN_SENT: /* No SYN back, no FIN needed */2255 break;
2256 caseTCP_SYN_RECV:
2257 caseTCP_ESTABLISHED: /* Closedown begin */2258 ns=TCP_FIN_WAIT1;
2259 send_fin=1;
2260 break;
2261 caseTCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */2262 caseTCP_FIN_WAIT2:
2263 caseTCP_CLOSING:
2264 ns=sk->state;
2265 break;
2266 caseTCP_CLOSE:
2267 caseTCP_LISTEN:
2268 break;
2269 caseTCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and2270 wait only for the ACK */2271 ns=TCP_LAST_ACK;
2272 send_fin=1;
2273 }2274
2275 tcp_set_state(sk,ns);
2276
2277 /*2278 * This is a (useful) BSD violating of the RFC. There is a2279 * problem with TCP as specified in that the other end could2280 * keep a socket open forever with no application left this end.2281 * We use a 3 minute timeout (about the same as BSD) then kill2282 * our end. If they send after that then tough - BUT: long enough2283 * that we won't make the old 4*rto = almost no time - whoops2284 * reset mistake.2285 */2286 if(dead && ns==TCP_FIN_WAIT2)
2287 {2288 inttimer_active=del_timer(&sk->timer);
2289 if(timer_active)
2290 add_timer(&sk->timer);
2291 else2292 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2293 }2294
2295 returnsend_fin;
2296 }2297
/*
 * Send a fin.
 *
 * Builds a FIN segment from the socket's template header and either
 * transmits it immediately or, if data is still queued, appends it to
 * the write queue so it goes out in order.  Advances write_seq by one
 * for the FIN's sequence number in all paths (including the failed
 * route case, so the sequence space stays consistent).
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 * Administrivia
	 */
	buff->sk = sk;
	buff->len = sizeof(*t1);
	buff->localroute = sk->localroute;
	t1 =(struct tcphdr *) buff->data;

	/*
	 * Put in the IP header and routing stuff.
	 */
	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		int t;
		/*
		 * Finish anyway, treat this as a send that got lost.
		 * (Not good).
		 */
		buff->free = 1;
		prot->wfree(sk,buff->mem_addr, buff->mem_len);
		sk->write_seq++;
		/* Keep an existing timer; otherwise fall back to the MSL timer. */
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}

	/*
	 * We ought to check if the end of the queue is a buffer and
	 * if so simply add the fin to that buffer, not send it ahead.
	 */
	t1 =(struct tcphdr *)((char *)t1 +tmp);
	buff->len += tmp;
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	t1->seq = ntohl(sk->write_seq);
	sk->write_seq++;
	buff->h.seq = sk->write_seq;
	t1->ack = 1;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 * If there is data in the write queue, the fin must be appended to
	 * the write queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		buff->free = 0;
		if (buff->next != NULL)
		{
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	}
	else
	{
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2401 /*2402 * Shutdown the sending side of a connection. Much like close except2403 * that we don't receive shut down or set sk->dead=1.2404 */2405
void tcp_shutdown(struct sock *sk, int how)
{
	/*
	 *	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 *	Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
	 *
	 *	Only the SEND_SHUTDOWN half is acted on here; receive
	 *	shutdown is handled elsewhere.
	 */

	if (!(how & SEND_SHUTDOWN))
		return;

	/*
	 *	If we've already sent a FIN, or it's a closed state, there is
	 *	nothing more to shut down.
	 */

	if (sk->state == TCP_FIN_WAIT1 ||
	    sk->state == TCP_FIN_WAIT2 ||
	    sk->state == TCP_CLOSING ||
	    sk->state == TCP_LAST_ACK ||
	    sk->state == TCP_TIME_WAIT ||
	    sk->state == TCP_CLOSE ||
	    sk->state == TCP_LISTEN
	  )
	{
		return;
	}
	sk->inuse = 1;	/* lock the socket against the bottom half */

	/*
	 *	Flag that the sender has shutdown.
	 */

	sk->shutdown |= SEND_SHUTDOWN;

	/*
	 *	Clear out any half completed packets so the FIN goes out
	 *	behind all queued data.
	 */

	if (sk->partial)
		tcp_send_partial(sk);

	/*
	 *	Send a FIN if the state transition calls for one.
	 *	(tcp_close_state(sk,0): the 0 means "not a full close".)
	 */

	if(tcp_close_state(sk,0))
		tcp_send_fin(sk);

	release_sock(sk);
}
2457
2458 staticint2459 tcp_recvfrom(structsock *sk, unsignedchar *to,
/* */2460 intto_len, intnonblock, unsignedflags,
2461 structsockaddr_in *addr, int *addr_len)
2462 {2463 intresult;
2464
2465 /* 2466 * Have to check these first unlike the old code. If 2467 * we check them after we lose data on an error2468 * which is wrong 2469 */2470
2471 if(addr_len)
2472 *addr_len = sizeof(*addr);
2473 result=tcp_read(sk, to, to_len, nonblock, flags);
2474
2475 if (result < 0)
2476 return(result);
2477
2478 if(addr)
2479 {2480 addr->sin_family = AF_INET;
2481 addr->sin_port = sk->dummy_th.dest;
2482 addr->sin_addr.s_addr = sk->daddr;
2483 }2484 return(result);
2485 }2486
2487
2488 /*2489 * This routine will send an RST to the other tcp. 2490 */2491
static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	int tmp;
	struct device *ndev=NULL;

	/*
	 *	Cannot reset a reset (think about it): answering an RST with
	 *	an RST would ping-pong forever.
	 */

	if(th->rst)
		return;

	/*
	 *	We need to grab some memory, and put together an RST,
	 *	and then put it into the queue to be sent.  No socket is
	 *	charged for the buffer (owner NULL) since this may be a
	 *	reply to a segment for which no socket exists.
	 */

	buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	  	return;

	buff->len = sizeof(*t1);
	buff->sk = NULL;
	buff->dev = dev;
	buff->localroute = 0;

	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr),tos,ttl);
	if (tmp < 0)
	{
		/* No route - give the buffer back and bail out. */
  		buff->free = 1;
		prot->wfree(NULL, buff->mem_addr, buff->mem_len);
		return;
	}

	t1 =(struct tcphdr *)((char *)t1 +tmp);
	buff->len += tmp;
	/* Start from the offending header, then overwrite the fields we need. */
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive endpoints.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->rst = 1;
	t1->window = 0;

	/*
	 *	Sequence number selection per RFC 793: if the incoming segment
	 *	carried an ACK, our RST uses its ack number as seq and carries
	 *	no ACK; otherwise seq is 0 and we ACK the incoming data
	 *	(one extra for the SYN, which occupies sequence space).
	 */
	if(th->ack)
	{
	  	t1->ack = 0;
	  	t1->seq = th->ack_seq;
	  	t1->ack_seq = 0;
	}
	else
	{
	  	t1->ack = 1;
	  	if(!th->syn)
			t1->ack_seq=htonl(th->seq);
		else
			t1->ack_seq=htonl(th->seq+1);
		t1->seq=0;
	}

	t1->syn = 0;
	t1->urg = 0;
	t1->fin = 0;
	t1->psh = 0;
	t1->doff = sizeof(*t1)/4;	/* header length in 32-bit words, no options */
	tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
	prot->queue_xmit(NULL, ndev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2575
2576 /*2577 * Look for tcp options. Parses everything but only knows about MSS.2578 * This routine is always called with the packet containing the SYN.2579 * However it may also be called with the ack to the SYN. So you2580 * can't assume this is always the SYN. It's always called after2581 * we have set up sk->mtu to our own MTU.2582 *2583 * We need at minimum to add PAWS support here. Possibly large windows2584 * as Linux gets deployed on 100Mb/sec networks.2585 */2586
2587 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2588 {2589 unsignedchar *ptr;
2590 intlength=(th->doff*4)-sizeof(structtcphdr);
2591 intmss_seen = 0;
2592
2593 ptr = (unsignedchar *)(th + 1);
2594
2595 while(length>0)
2596 {2597 intopcode=*ptr++;
2598 intopsize=*ptr++;
2599 switch(opcode)
2600 {2601 caseTCPOPT_EOL:
2602 return;
2603 caseTCPOPT_NOP:
2604 length-=2;
2605 continue;
2606
2607 default:
2608 if(opsize<=2) /* Avoid silly options looping forever */2609 return;
2610 switch(opcode)
2611 {2612 caseTCPOPT_MSS:
2613 if(opsize==4 && th->syn)
2614 {2615 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2616 mss_seen = 1;
2617 }2618 break;
2619 /* Add other options here as people feel the urge to implement stuff like large windows */2620 }2621 ptr+=opsize-2;
2622 length-=opsize;
2623 }2624 }2625 if (th->syn)
2626 {2627 if (! mss_seen)
2628 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2629 }2630 #ifdefCONFIG_INET_PCTCP2631 sk->mss = min(sk->max_window >> 1, sk->mtu);
2632 #else2633 sk->mss = min(sk->max_window, sk->mtu);
2634 #endif2635 }2636
/*
 *	Return the classful (A/B/C) network mask for an address given in
 *	network byte order; the result is in network byte order too.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);

	if (IN_CLASSA(host))
		return htonl(IN_CLASSA_NET);
	else if (IN_CLASSB(host))
		return htonl(IN_CLASSB_NET);
	else
		return htonl(IN_CLASSC_NET);
}
2647 /*2648 * Default sequence number picking algorithm.2649 */2650
extern inline long tcp_init_seq(void)
{
	/*
	 *	Clock-driven initial sequence number: scale the jiffies
	 *	counter and offset it.  NOTE(review): a purely clock-based
	 *	ISN is predictable by an off-path attacker; presumably fine
	 *	for its era, but worth confirming against current guidance
	 *	(randomised ISNs) if this code is ever reused.
	 */
	return jiffies * SEQ_TICK - seq_offset;
}
2656 /*2657 * This routine handles a connection request.2658 * It should make sure we haven't already responded.2659 * Because of the way BSD works, we have to send a syn/ack now.2660 * This also means it will be harder to close a socket which is2661 * listening.2662 */2663
2664 staticvoidtcp_conn_request(structsock *sk, structsk_buff *skb,
/* */2665 unsignedlongdaddr, unsignedlongsaddr,
2666 structoptions *opt, structdevice *dev, unsignedlongseq)
2667 {2668 structsk_buff *buff;
2669 structtcphdr *t1;
2670 unsignedchar *ptr;
2671 structsock *newsk;
2672 structtcphdr *th;
2673 structdevice *ndev=NULL;
2674 inttmp;
2675 structrtable *rt;
2676
2677 th = skb->h.th;
2678
2679 /* If the socket is dead, don't accept the connection. */2680 if (!sk->dead)
2681 {2682 sk->data_ready(sk,0);
2683 }2684 else2685 {2686 if(sk->debug)
2687 printk("Reset on %p: Connect on dead socket.\n",sk);
2688 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2689 tcp_statistics.TcpAttemptFails++;
2690 kfree_skb(skb, FREE_READ);
2691 return;
2692 }2693
2694 /*2695 * Make sure we can accept more. This will prevent a2696 * flurry of syns from eating up all our memory.2697 */2698
2699 if (sk->ack_backlog >= sk->max_ack_backlog)
2700 {2701 tcp_statistics.TcpAttemptFails++;
2702 kfree_skb(skb, FREE_READ);
2703 return;
2704 }2705
2706 /*2707 * We need to build a new sock struct.2708 * It is sort of bad to have a socket without an inode attached2709 * to it, but the wake_up's will just wake up the listening socket,2710 * and if the listening socket is destroyed before this is taken2711 * off of the queue, this will take care of it.2712 */2713
2714 newsk = (structsock *) kmalloc(sizeof(structsock), GFP_ATOMIC);
2715 if (newsk == NULL)
2716 {2717 /* just ignore the syn. It will get retransmitted. */2718 tcp_statistics.TcpAttemptFails++;
2719 kfree_skb(skb, FREE_READ);
2720 return;
2721 }2722
2723 memcpy(newsk, sk, sizeof(*newsk));
2724 skb_queue_head_init(&newsk->write_queue);
2725 skb_queue_head_init(&newsk->receive_queue);
2726 newsk->send_head = NULL;
2727 newsk->send_tail = NULL;
2728 skb_queue_head_init(&newsk->back_log);
2729 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/2730 newsk->rto = TCP_TIMEOUT_INIT;
2731 newsk->mdev = 0;
2732 newsk->max_window = 0;
2733 newsk->cong_window = 1;
2734 newsk->cong_count = 0;
2735 newsk->ssthresh = 0;
2736 newsk->backoff = 0;
2737 newsk->blog = 0;
2738 newsk->intr = 0;
2739 newsk->proc = 0;
2740 newsk->done = 0;
2741 newsk->partial = NULL;
2742 newsk->pair = NULL;
2743 newsk->wmem_alloc = 0;
2744 newsk->rmem_alloc = 0;
2745 newsk->localroute = sk->localroute;
2746
2747 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
2748
2749 newsk->err = 0;
2750 newsk->shutdown = 0;
2751 newsk->ack_backlog = 0;
2752 newsk->acked_seq = skb->h.th->seq+1;
2753 newsk->copied_seq = skb->h.th->seq+1;
2754 newsk->fin_seq = skb->h.th->seq;
2755 newsk->state = TCP_SYN_RECV;
2756 newsk->timeout = 0;
2757 newsk->ip_xmit_timeout = 0;
2758 newsk->write_seq = seq;
2759 newsk->window_seq = newsk->write_seq;
2760 newsk->rcv_ack_seq = newsk->write_seq;
2761 newsk->urg_data = 0;
2762 newsk->retransmits = 0;
2763 newsk->linger=0;
2764 newsk->destroy = 0;
2765 init_timer(&newsk->timer);
2766 init_timer(&newsk->retransmit_timer);
2767 newsk->timer.data = (unsignedlong)newsk;
2768 newsk->timer.function = &net_timer;
2769 newsk->retransmit_timer.data = (unsignedlong)newsk;
2770 newsk->retransmit_timer.function=&retransmit_timer;
2771 newsk->dummy_th.source = skb->h.th->dest;
2772 newsk->dummy_th.dest = skb->h.th->source;
2773
2774 /*2775 * Swap these two, they are from our point of view. 2776 */2777
2778 newsk->daddr = saddr;
2779 newsk->saddr = daddr;
2780
2781 put_sock(newsk->num,newsk);
2782 newsk->dummy_th.res1 = 0;
2783 newsk->dummy_th.doff = 6;
2784 newsk->dummy_th.fin = 0;
2785 newsk->dummy_th.syn = 0;
2786 newsk->dummy_th.rst = 0;
2787 newsk->dummy_th.psh = 0;
2788 newsk->dummy_th.ack = 0;
2789 newsk->dummy_th.urg = 0;
2790 newsk->dummy_th.res2 = 0;
2791 newsk->acked_seq = skb->h.th->seq + 1;
2792 newsk->copied_seq = skb->h.th->seq + 1;
2793 newsk->socket = NULL;
2794
2795 /*2796 * Grab the ttl and tos values and use them 2797 */2798
2799 newsk->ip_ttl=sk->ip_ttl;
2800 newsk->ip_tos=skb->ip_hdr->tos;
2801
2802 /*2803 * Use 512 or whatever user asked for 2804 */2805
2806 /*2807 * Note use of sk->user_mss, since user has no direct access to newsk 2808 */2809
2810 rt=ip_rt_route(saddr, NULL,NULL);
2811
2812 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
2813 newsk->window_clamp = rt->rt_window;
2814 else2815 newsk->window_clamp = 0;
2816
2817 if (sk->user_mss)
2818 newsk->mtu = sk->user_mss;
2819 elseif(rt!=NULL && (rt->rt_flags&RTF_MSS))
2820 newsk->mtu = rt->rt_mss - HEADER_SIZE;
2821 else2822 {2823 #ifdefCONFIG_INET_SNARL/* Sub Nets Are Local */2824 if ((saddr ^ daddr) & default_mask(saddr))
2825 #else2826 if ((saddr ^ daddr) & dev->pa_mask)
2827 #endif2828 newsk->mtu = 576 - HEADER_SIZE;
2829 else2830 newsk->mtu = MAX_WINDOW;
2831 }2832
2833 /*2834 * But not bigger than device MTU 2835 */2836
2837 newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);
2838
2839 /*2840 * This will min with what arrived in the packet 2841 */2842
2843 tcp_options(newsk,skb->h.th);
2844
2845 buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
2846 if (buff == NULL)
2847 {2848 sk->err = -ENOMEM;
2849 newsk->dead = 1;
2850 release_sock(newsk);
2851 kfree_skb(skb, FREE_READ);
2852 tcp_statistics.TcpAttemptFails++;
2853 return;
2854 }2855
2856 buff->len = sizeof(structtcphdr)+4;
2857 buff->sk = newsk;
2858 buff->localroute = newsk->localroute;
2859
2860 t1 =(structtcphdr *) buff->data;
2861
2862 /*2863 * Put in the IP header and routing stuff. 2864 */2865
2866 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
2867 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
2868
2869 /*2870 * Something went wrong. 2871 */2872
2873 if (tmp < 0)
2874 {2875 sk->err = tmp;
2876 buff->free = 1;
2877 kfree_skb(buff,FREE_WRITE);
2878 newsk->dead = 1;
2879 release_sock(newsk);
2880 skb->sk = sk;
2881 kfree_skb(skb, FREE_READ);
2882 tcp_statistics.TcpAttemptFails++;
2883 return;
2884 }2885
2886 buff->len += tmp;
2887 t1 =(structtcphdr *)((char *)t1 +tmp);
2888
2889 memcpy(t1, skb->h.th, sizeof(*t1));
2890 buff->h.seq = newsk->write_seq;
2891 /*2892 * Swap the send and the receive. 2893 */2894 t1->dest = skb->h.th->source;
2895 t1->source = newsk->dummy_th.source;
2896 t1->seq = ntohl(newsk->write_seq++);
2897 t1->ack = 1;
2898 newsk->window = tcp_select_window(newsk);
2899 newsk->sent_seq = newsk->write_seq;
2900 t1->window = ntohs(newsk->window);
2901 t1->res1 = 0;
2902 t1->res2 = 0;
2903 t1->rst = 0;
2904 t1->urg = 0;
2905 t1->psh = 0;
2906 t1->syn = 1;
2907 t1->ack_seq = ntohl(skb->h.th->seq+1);
2908 t1->doff = sizeof(*t1)/4+1;
2909 ptr =(unsignedchar *)(t1+1);
2910 ptr[0] = 2;
2911 ptr[1] = 4;
2912 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
2913 ptr[3] =(newsk->mtu) & 0xff;
2914
2915 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
2916 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
2917 reset_xmit_timer(newsk, TIME_WRITE, newsk->rto);
2918
2919 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
2920 skb->sk = newsk;
2921
2922 /*2923 * Charge the sock_buff to newsk. 2924 */2925
2926 sk->rmem_alloc -= skb->mem_len;
2927 newsk->rmem_alloc += skb->mem_len;
2928
2929 skb_queue_tail(&sk->receive_queue,skb);
2930 sk->ack_backlog++;
2931 release_sock(newsk);
2932 tcp_statistics.TcpOutSegs++;
2933 }2934
2935
/*
 *	Close a TCP socket.  timeout != 0 means a hard close (go straight
 *	to CLOSE); timeout == 0 is the normal descriptor close path which
 *	flushes the receive queue and sends a FIN if the state machine
 *	requires one.
 */
static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 * We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 */

	sk->inuse = 1;	/* lock against the bottom half */

	if(sk->state == TCP_LISTEN)
	{
		/* Special case: a listener has no peer, just reap pending
		   embryonic connections and close. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;

	if (!sk->dead)
	  	sk->state_change(sk);

	if (timeout == 0)
	{
		struct sk_buff *skb;

		/*
		 *  We need to flush the recv. buffs.  We do this only on the
		 *  descriptor close, not protocol-sourced closes, because the
		 *  reader process may not have drained the data yet!
		 */

		while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 *	Get rid of any half-completed packets.
		 */

		if (sk->partial)
			tcp_send_partial(sk);
	}


	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */

	if(timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* tcp_close_state(sk,1): full close; returns 1 when a FIN
		   must be transmitted for the resulting state. */
		if(tcp_close_state(sk,1)==1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
3000
3001 /*3002 * This routine takes stuff off of the write queue,3003 * and puts it in the xmit queue. This happens as incoming acks3004 * open up the remote window for us.3005 */3006
static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy.
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->h.seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it. This
			 *	cannot currently occur.
			 */

			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
			/*
			 * Put in the ack seq and window at this point rather than earlier,
			 * in order to keep them monotonic. We really want to avoid taking
			 * back window allocations. That's legal, but RFC1122 says it's frowned on.
			 * Ack and window will in general have changed since this packet was put
			 * on the write queue.
			 */
			iph = (struct iphdr *)(skb->data +
					       skb->dev->hard_header_len);
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			/* TCP segment length = total buffer length minus link+IP headers. */
			size = skb->len - (((unsigned char *) th) - skb->data);

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			/* Checksum must be recomputed after patching ack/window. */
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/*
			 *	IP manages our queue for some crazy reason.
			 */

			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly.
			 */

			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3092
3093 /*3094 * This routine deals with incoming acks, but not outgoing ones.3095 */3096
extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
	unsigned long ack;
	int flag = 0;

	/*
	 * flag bits:
	 * 1 - there was data in packet as well as ack or new data is sent or
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if(sk->zapped)
		return(1);	/* Dead, can't ack any more so why bother */

	/*
	 *	Have we discovered a larger window?
	 */

	ack = ntohl(th->ack_seq);

	if (ntohs(th->window) > sk->max_window)
	{
  		sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(sk->max_window>>1, sk->mtu);
#else
		sk->mss = min(sk->max_window, sk->mtu);
#endif
	}

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */

	if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
	  	sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
	{
		if(sk->debug)
			printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);

		/*
		 *	Keepalive processing: an ack for data we never sent
		 *	is simply dropped.
		 */

		if (after(ack, sk->sent_seq))
		{
			return(0);
		}

		/*
		 *	Restart the keepalive timer for an old (duplicate) ack.
		 */

		if (sk->keepopen)
		{
			if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
		}
		return(1);
	}

	/*
	 *	If there is data, set flag 1.
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 *	See if our window has been shrunk.
	 */

	if (after(sk->window_seq, ack+ntohs(th->window)))
	{
		/*
		 * We may need to move packets from the send queue
		 * to the write queue, if the window has been shrunk on us.
		 * The RFC says you are not allowed to shrink your window
		 * like this, but if the other end does, you must be able
		 * to deal with it.
		 */
		struct sk_buff *skb;
		struct sk_buff *skb2;
		struct sk_buff *wskb = NULL;

		skb2 = sk->send_head;
		sk->send_head = NULL;
		sk->send_tail = NULL;

		/*
		 *	This is an artifact of a flawed concept. We want one
		 *	queue and a smarter send routine when we send all.
		 */

		flag |= 4;	/* Window changed */

		sk->window_seq = ack + ntohs(th->window);
		cli();		/* walk the retransmit list atomically */
		while (skb2 != NULL)
		{
			skb = skb2;
			skb2 = skb->link3;
			skb->link3 = NULL;
			if (after(skb->h.seq, sk->window_seq))
			{
				/* Beyond the new window edge: back onto write_queue. */
				if (sk->packets_out > 0)
					sk->packets_out--;
				/* We may need to remove this from the dev send list. */
				if (skb->next != NULL)
				{
					skb_unlink(skb);
				}
				/* Now add it to the write_queue, preserving order. */
				if (wskb == NULL)
					skb_queue_head(&sk->write_queue,skb);
				else
					skb_append(wskb,skb);
				wskb = skb;
			}
			else
			{
				/* Still inside the window: keep it on the
				   (rebuilt) retransmit list. */
				if (sk->send_head == NULL)
				{
					sk->send_head = skb;
					sk->send_tail = skb;
				}
				else
				{
					sk->send_tail->link3 = skb;
					sk->send_tail = skb;
				}
				skb->link3 = NULL;
			}
		}
		sti();
	}

	/*
	 *	Pipe has emptied.
	 */

	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out= 0;
	}

	/*
	 *	Update the right hand window edge of the host.
	 */

	sk->window_seq = ack + ntohs(th->window);

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a
		 * counter and increment it once every cwnd times.  It's possible
		 * that this should be done only if sk->retransmits == 0.  I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase.
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly.  In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

	/*
	 *	Remember the highest ack received.
	 */

	sk->rcv_ack_seq = ack;

	/*
	 *	If this ack opens up a zero window, clear backoff.  It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open?
		 */

  		if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
		    ! before (sk->window_seq, sk->write_queue.next->h.seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt.  This eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < 20)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = 20;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	while(sk->send_head != NULL)
	{
		/* Check for a bug: the list must stay sequence-ordered. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived at the other end.
		 */

		if (before(sk->send_head->h.seq, ack+1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 *	We were retransmitting.  Don't count this in RTT est.
				 */
				flag |= 2;

				/*
				 * Even though we've gotten an ack, we're still
				 * retransmitting as long as we're sending from
				 * the retransmit queue.  Keeping retransmits non-zero
				 * prevents us from getting new data interspersed with
				 * retransmissions.
				 */

				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
  			/*
			 * Note that we only reset backoff and rto in the
			 * rtt recomputation code.  And that doesn't happen
			 * if there were retransmissions in effect.  So the
			 * first new packet after the retransmissions is
			 * sent with the backoff still in effect.  Not until
			 * we get an ack from a non-retransmitted packet do
			 * we reset the backoff and rto.  This allows us to deal
			 * with a situation where the network delay has increased
			 * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 *	We have one less packet out there.
			 */

			if (sk->packets_out > 0)
				sk->packets_out --;
			/*
			 *	Wake up the process, it can probably write more.
			 */
			if (!sk->dead)
				sk->write_space(sk);
			oskb = sk->send_head;

			if (!(flag&2)) 	/* Not retransmitting */
			{
				long m;

				/*
				 *	The following amusing code comes from Jacobson's
				 *	article in SIGCOMM '88.  Note that rtt and mdev
				 *	are scaled versions of rtt and mean deviation.
				 *	This is designed to be as fast as possible.
				 *	m stands for "measurement".
				 */

				m = jiffies - oskb->when;  /* RTT */
				if(m<=0)
					m=1;		/* IS THIS RIGHT FOR <0 ??? */
				m -= (sk->rtt >> 3);    /* m is now error in rtt est */
				sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
				if (m < 0)
					m = -m;		/* m is now abs(error) */
				m -= (sk->mdev >> 2);   /* similar update on mdev */
				sk->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */

				/*
				 *	Now update timeout.  Note that this removes any backoff.
				 */

				sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
				if (sk->rto > 120*HZ)
					sk->rto = 120*HZ;
				if (sk->rto < 20)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
					sk->rto = 20;
				sk->backoff = 0;
			}
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
					   in this case as we just set it up' */
			cli();		/* unlink the acked skb atomically */
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 *	We may need to remove this from the dev send list.
			 */

			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 * XXX someone ought to look at this too.. at the moment, if skb_peek()
	 * returns non-NULL, we completely ignore the timer stuff in the else
	 * clause.  We ought to organize the code so that else clause can
	 * (should) be executed regardless, possibly moving the PROBE timer
	 * reset over.  The skb_peek() thing should only move stuff to the
	 * write queue, NOT also manage the timer functions.
	 */

	/*
	 *	Maybe we can take some stuff off of the write queue,
	 *	and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
		        (sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
 			sk->send_head == NULL &&
 			sk->ack_backlog == 0 &&
 			sk->state != TCP_TIME_WAIT)
 		{
 			/*
 			 *	Data to queue but no room: start zero-window probing.
 			 */
 	        	reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
 		}
	}
	else
	{
		/*
		 * From TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 * from TCP_CLOSE we don't do anything.
		 *
		 * From anything else, if there is write data (or fin) pending,
		 * we use a TIME_WRITE timeout, else if keepalive we reset to
		 * a KEEPALIVE timeout, else we delete the timer.
		 *
		 * We do not set flag for nominal write data, otherwise we may
		 * force a state where we start to write itsy bitsy tidbits
		 * of data.
		 */

		switch(sk->state) {
		case TCP_TIME_WAIT:
			/*
			 * Keep us in TIME_WAIT until we stop getting packets,
			 * reset the timeout.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 * Don't touch the timer.
			 */
			break;
		default:
			/*
			 * 	Must check send_head, write_queue, and ack_backlog
			 * 	to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 *	We have nothing queued but space to send. Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
	 * we are now waiting for an acknowledge to our FIN. The other end is
	 * already in TIME_WAIT.
	 *
	 * Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if(sk->debug)
			printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			tcp_set_state(sk,TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 * Move to FIN_WAIT2 to await a FIN from the other end. Set
	 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 *	Move to TIME_WAIT.
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake.
	 */

	if(sk->state==TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest=th->source;
		sk->copied_seq = sk->acked_seq;
		if(!sk->dead)
			sk->state_change(sk);
		if(sk->max_window==0)
		{
			sk->max_window=32;	/* Sanity check */
			sk->mss=min(sk->max_window,sk->mtu);
		}
	}

	/*
	 * I make no guarantees about the first clause in the following
	 * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
	 * what conditions "!flag" would be true.  However I think the rest
	 * of the conditions would prevent that from causing any
	 * unnecessary retransmission.
	 * Clearly if the first packet has expired it should be
	 * retransmitted.  The other alternative, "flag&2 && retransmits", is
	 * harder to explain:  You have to look carefully at how and when the
	 * timer is set and with what timeout.  The most recent transmission always
	 * sets the timer.  So in general if the most recent thing has timed
	 * out, everything before it has as well.  So we want to go ahead and
	 * retransmit some more.  If we didn't explicitly test for this
	 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 * would not be true.  If you look at the pattern of timing, you can
	 * show that rto is increased fast enough that the next packet would
	 * almost never be retransmitted immediately.  Then you'd end up
	 * waiting for a timeout to send each packet on the retransmission
	 * queue.  With my implementation of the Karn sampling algorithm,
	 * the timeout would double each time.  The net result is that it would
	 * take a hideous amount of time to recover from a single dropped packet.
	 * It's possible that there should also be a test for TIME_WRITE, but
	 * I think as long as "send_head != NULL" and "retransmit" is on, we've
	 * got to be in real retransmission mode.
	 * Note that tcp_do_retransmit is called with all==1.  Setting cong_window
	 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 * As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	       (((flag&2) && sk->retransmits) ||
	       (sk->send_head->when + sk->rto < jiffies)))
	{
		if(sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return(1);
}
3683
/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 */

static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	/* Sequence number just past the FIN: data plus the SYN/FIN flag octets. */
	sk->fin_seq = th->seq + skb->len + th->syn + th->fin;

	/* Wake anyone sleeping on or polling this socket. */
	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * move to CLOSE_WAIT, tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * received a retransmission of the FIN,
			 * restart the TIME_WAIT timer.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens, we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */
			if(sk->ip_xmit_timeout != TIME_WRITE)
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * received a FIN -- send ACK and enter TIME_WAIT
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown |= SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * already in CLOSE
			 */
			break;
		default:
			/* Any other state (e.g. LAST_ACK re-entry) - go to LAST_ACK
			   and arm the close timer. */
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}
3779
3780
3781 /*3782 * This routine handles the data. If there is room in the buffer,3783 * it will be have already been moved into it. If there is no3784 * room, then we will just have to discard the packet.3785 */3786
3787 extern__inline__inttcp_data(structsk_buff *skb, structsock *sk,
/* */3788 unsignedlongsaddr, unsignedshortlen)
3789 {3790 structsk_buff *skb1, *skb2;
3791 structtcphdr *th;
3792 intdup_dumped=0;
3793 unsignedlongnew_seq;
3794 unsignedlongshut_seq;
3795
3796 th = skb->h.th;
3797 skb->len = len -(th->doff*4);
3798
3799 /*3800 * The bytes in the receive read/assembly queue has increased. Needed for the3801 * low memory discard algorithm 3802 */3803
3804 sk->bytes_rcv += skb->len;
3805
3806 if (skb->len == 0 && !th->fin && !th->urg && !th->psh)
3807 {3808 /* 3809 * Don't want to keep passing ack's back and forth. 3810 * (someone sent us dataless, boring frame)3811 */3812 if (!th->ack)
3813 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3814 kfree_skb(skb, FREE_READ);
3815 return(0);
3816 }3817
3818 /*3819 * We no longer have anyone receiving data on this connection.3820 */3821
3822 #ifndef TCP_DONT_RST_SHUTDOWN
3823
3824 if(sk->shutdown & RCV_SHUTDOWN)
3825 {3826 /*3827 * FIXME: BSD has some magic to avoid sending resets to3828 * broken 4.2 BSD keepalives. Much to my surprise a few non3829 * BSD stacks still have broken keepalives so we want to3830 * cope with it.3831 */3832
3833 if(skb->len) /* We don't care if its just an ack or3834 a keepalive/window probe */3835 {3836 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */3837
3838 /* Do this the way 4.4BSD treats it. Not what I'd3839 regard as the meaning of the spec but its what BSD3840 does and clearly they know everything 8) */3841
3842 /*3843 * This is valid because of two things3844 *3845 * a) The way tcp_data behaves at the bottom.3846 * b) A fin takes effect when read not when received.3847 */3848
3849 shut_seq=sk->acked_seq+1; /* Last byte */3850
3851 if(after(new_seq,shut_seq))
3852 {3853 if(sk->debug)
3854 printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n",
3855 sk, new_seq, shut_seq, sk->blog);
3856 if(sk->dead)
3857 {3858 sk->acked_seq = new_seq + th->fin;
3859 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3860 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3861 tcp_statistics.TcpEstabResets++;
3862 tcp_set_state(sk,TCP_CLOSE);
3863 sk->err = EPIPE;
3864 sk->shutdown = SHUTDOWN_MASK;
3865 kfree_skb(skb, FREE_READ);
3866 return 0;
3867 }3868 }3869 }3870 }3871
3872 #endif3873
3874 /*3875 * Now we have to walk the chain, and figure out where this one3876 * goes into it. This is set up so that the last packet we received3877 * will be the first one we look at, that way if everything comes3878 * in order, there will be no performance loss, and if they come3879 * out of order we will be able to fit things in nicely.3880 *3881 * [AC: This is wrong. We should assume in order first and then walk3882 * forwards from the first hole based upon real traffic patterns.]3883 * 3884 */3885
3886 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */3887 {3888 skb_queue_head(&sk->receive_queue,skb);
3889 skb1= NULL;
3890 }3891 else3892 {3893 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
3894 {3895 if(sk->debug)
3896 {3897 printk("skb1=%p :", skb1);
3898 printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
3899 printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
3900 printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
3901 sk->acked_seq);
3902 }3903
3904 /*3905 * Optimisation: Duplicate frame or extension of previous frame from3906 * same sequence point (lost ack case).3907 * The frame contains duplicate data or replaces a previous frame3908 * discard the previous frame (safe as sk->inuse is set) and put3909 * the new one in its place.3910 */3911
3912 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3913 {3914 skb_append(skb1,skb);
3915 skb_unlink(skb1);
3916 kfree_skb(skb1,FREE_READ);
3917 dup_dumped=1;
3918 skb1=NULL;
3919 break;
3920 }3921
3922 /*3923 * Found where it fits3924 */3925
3926 if (after(th->seq+1, skb1->h.th->seq))
3927 {3928 skb_append(skb1,skb);
3929 break;
3930 }3931
3932 /*3933 * See if we've hit the start. If so insert.3934 */3935 if (skb1 == skb_peek(&sk->receive_queue))
3936 {3937 skb_queue_head(&sk->receive_queue, skb);
3938 break;
3939 }3940 }3941 }3942
3943 /*3944 * Figure out what the ack value for this frame is3945 */3946
3947 th->ack_seq = th->seq + skb->len;
3948 if (th->syn)
3949 th->ack_seq++;
3950 if (th->fin)
3951 th->ack_seq++;
3952
3953 if (before(sk->acked_seq, sk->copied_seq))
3954 {3955 printk("*** tcp.c:tcp_data bug acked < copied\n");
3956 sk->acked_seq = sk->copied_seq;
3957 }3958
3959 /*3960 * Now figure out if we can ack anything. This is very messy because we really want two3961 * receive queues, a completed and an assembly queue. We also want only one transmit3962 * queue.3963 */3964
3965 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
3966 {3967 if (before(th->seq, sk->acked_seq+1))
3968 {3969 intnewwindow;
3970
3971 if (after(th->ack_seq, sk->acked_seq))
3972 {3973 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3974 if (newwindow < 0)
3975 newwindow = 0;
3976 sk->window = newwindow;
3977 sk->acked_seq = th->ack_seq;
3978 }3979 skb->acked = 1;
3980
3981 /*3982 * When we ack the fin, we do the FIN 3983 * processing.3984 */3985
3986 if (skb->h.th->fin)
3987 {3988 tcp_fin(skb,sk,skb->h.th);
3989 }3990
3991 for(skb2 = skb->next;
3992 skb2 != (structsk_buff *)&sk->receive_queue;
3993 skb2 = skb2->next)
3994 {3995 if (before(skb2->h.th->seq, sk->acked_seq+1))
3996 {3997 if (after(skb2->h.th->ack_seq, sk->acked_seq))
3998 {3999 newwindow = sk->window -
4000 (skb2->h.th->ack_seq - sk->acked_seq);
4001 if (newwindow < 0)
4002 newwindow = 0;
4003 sk->window = newwindow;
4004 sk->acked_seq = skb2->h.th->ack_seq;
4005 }4006 skb2->acked = 1;
4007 /*4008 * When we ack the fin, we do4009 * the fin handling.4010 */4011 if (skb2->h.th->fin)
4012 {4013 tcp_fin(skb,sk,skb->h.th);
4014 }4015
4016 /*4017 * Force an immediate ack.4018 */4019
4020 sk->ack_backlog = sk->max_ack_backlog;
4021 }4022 else4023 {4024 break;
4025 }4026 }4027
4028 /*4029 * This also takes care of updating the window.4030 * This if statement needs to be simplified.4031 */4032 if (!sk->delay_acks ||
4033 sk->ack_backlog >= sk->max_ack_backlog ||
4034 sk->bytes_rcv > sk->max_unacked || th->fin) {4035 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4036 }4037 else4038 {4039 sk->ack_backlog++;
4040 if(sk->debug)
4041 printk("Ack queued.\n");
4042 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4043 }4044 }4045 }4046
4047 /*4048 * If we've missed a packet, send an ack.4049 * Also start a timer to send another.4050 */4051
4052 if (!skb->acked)
4053 {4054
4055 /*4056 * This is important. If we don't have much room left,4057 * we need to throw out a few packets so we have a good4058 * window. Note that mtu is used, not mss, because mss is really4059 * for the send side. He could be sending us stuff as large as mtu.4060 */4061
4062 while (sk->prot->rspace(sk) < sk->mtu)
4063 {4064 skb1 = skb_peek(&sk->receive_queue);
4065 if (skb1 == NULL)
4066 {4067 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4068 break;
4069 }4070
4071 /*4072 * Don't throw out something that has been acked. 4073 */4074
4075 if (skb1->acked)
4076 {4077 break;
4078 }4079
4080 skb_unlink(skb1);
4081 kfree_skb(skb1, FREE_READ);
4082 }4083 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4084 sk->ack_backlog++;
4085 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4086 }4087 else4088 {4089 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4090 }4091
4092 /*4093 * Now tell the user we may have some data. 4094 */4095
4096 if (!sk->dead)
4097 {4098 if(sk->debug)
4099 printk("Data wakeup.\n");
4100 sk->data_ready(sk,0);
4101 }4102 return(0);
4103 }4104
4105
4106 /*4107 * This routine is only called when we have urgent data4108 * signalled. Its the 'slow' part of tcp_urg. It could be4109 * moved inline now as tcp_urg is only called from one4110 * place. We handle URGent data wrong. We have to - as4111 * BSD still doesn't use the correction from RFC961.4112 */4113
4114 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4115 {4116 unsignedlongptr = ntohs(th->urg_ptr);
4117
4118 if (ptr)
4119 ptr--;
4120 ptr += th->seq;
4121
4122 /* ignore urgent data that we've already seen and read */4123 if (after(sk->copied_seq, ptr))
4124 return;
4125
4126 /* do we already have a newer (or duplicate) urgent pointer? */4127 if (sk->urg_data && !after(ptr, sk->urg_seq))
4128 return;
4129
4130 /* tell the world about our new urgent pointer */4131 if (sk->proc != 0) {4132 if (sk->proc > 0) {4133 kill_proc(sk->proc, SIGURG, 1);
4134 }else{4135 kill_pg(-sk->proc, SIGURG, 1);
4136 }4137 }4138 sk->urg_data = URG_NOTYET;
4139 sk->urg_seq = ptr;
4140 }4141
4142 /*4143 * This is the 'fast' part of urgent handling.4144 */4145
4146 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4147 unsignedlongsaddr, unsignedlonglen)
4148 {4149 unsignedlongptr;
4150
4151 /*4152 * Check if we get a new urgent pointer - normally not 4153 */4154
4155 if (th->urg)
4156 tcp_check_urg(sk,th);
4157
4158 /*4159 * Do we wait for any urgent data? - normally not4160 */4161
4162 if (sk->urg_data != URG_NOTYET)
4163 return 0;
4164
4165 /*4166 * Is the urgent pointer pointing into this packet? 4167 */4168
4169 ptr = sk->urg_seq - th->seq + th->doff*4;
4170 if (ptr >= len)
4171 return 0;
4172
4173 /*4174 * Ok, got the correct packet, update info 4175 */4176
4177 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4178 if (!sk->dead)
4179 sk->data_ready(sk,0);
4180 return 0;
4181 }4182
/*
 *	This will accept the next outstanding connection.
 *
 *	Returns the newly created socket, or NULL with sk->err set to
 *	EINVAL (not listening), EAGAIN (non-blocking, nothing pending)
 *	or ERESTARTSYS (interrupted by a signal).
 */

static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 *	We need to make sure that this socket is listening,
	 *	and that it has something pending.
	 */

	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race: disable interrupts while we grab the socket. */
	cli();
	sk->inuse = 1;

	/* Wait until an established connection is queued. */
	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Drop the lock while we sleep so the backlog can be filled. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked)
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;	/* re-acquire before re-testing the queue */
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk.
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4241
/*
 *	This will initiate an outgoing connection: validate the address,
 *	pick the initial sequence number, build and transmit the SYN
 *	(with an MSS option) and move the socket to SYN_SENT.
 */

static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
	{
		return(-EISCONN);
	}

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */

	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 *	Don't want a TCP connection going to a broadcast address
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	/* Clock-driven initial send sequence. */
	sk->write_seq = jiffies * SEQ_TICK - seq_offset;
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	release_sock(sk);

	/* May sleep (GFP_KERNEL), hence the lock was dropped above. */
	buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->len = 24;		/* TCP header (20) + MSS option (4) */
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;

	t1 = (struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	rt=ip_rt_route(sk->daddr, NULL, NULL);

	/*
	 *	We need to build the routing stuff from the things saved in skb.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 +tmp);	/* skip past IP header */

	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	t1->seq = ntohl(sk->write_seq++);
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;	/* 24 bytes: header plus MSS option */

	/* Honour any per-route window clamp. */
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/* MSS selection: user setting, then route, then subnet heuristic. */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
		sk->mtu = rt->rt_mss;
	else
	{
#ifdef CONFIG_INET_SNARL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - HEADER_SIZE;	/* off-net: be conservative */
		else
			sk->mtu = MAX_WINDOW;
	}

	/*
	 *	but not bigger than device MTU
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	Put in the TCP options to say MTU.
	 */

	ptr = (unsigned char *)(t1+1);
	ptr[0] = 2;			/* option kind: MSS */
	ptr[1] = 4;			/* option length */
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 *	This must go first otherwise a really quick response will get reset.
	 */

	tcp_set_state(sk,TCP_SYN_SENT);
	sk->rto = TCP_TIMEOUT_INIT;
	init_timer(&sk->retransmit_timer);
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	/* Timer for repeating the SYN until an answer */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	sk->retransmits = TCP_SYN_RETRIES;

	sk->prot->queue_xmit(sk, dev, buff, 0);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4404
4405 /* This functions checks to see if the tcp header is actually acceptable. */4406 extern__inline__inttcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4407 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4408 {4409 unsignedlongnext_seq;
4410
4411 next_seq = len - 4*th->doff;
4412 if (th->fin)
4413 next_seq++;
4414 /* if we have a zero window, we can't have any data in the packet.. */4415 if (next_seq && !sk->window)
4416 gotoignore_it;
4417 next_seq += th->seq;
4418
4419 /*4420 * This isn't quite right. sk->acked_seq could be more recent4421 * than sk->window. This is however close enough. We will accept4422 * slightly more packets than we should, but it should not cause4423 * problems unless someone is trying to forge packets.4424 */4425
4426 /* have we already seen all of this packet? */4427 if (!after(next_seq+1, sk->acked_seq))
4428 gotoignore_it;
4429 /* or does it start beyond the window? */4430 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4431 gotoignore_it;
4432
4433 /* ok, at least part of this packet would seem interesting.. */4434 return 1;
4435
4436 ignore_it:
4437 if (th->rst)
4438 return 0;
4439
4440 /*4441 * Send a reset if we get something not ours and we are4442 * unsynchronized. Note: We don't do anything to our end. We4443 * are just killing the bogus remote connection then we will4444 * connect again and it will work (with luck).4445 */4446
4447 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4448 {4449 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4450 return 1;
4451 }4452
4453 /* Try to resync things. */4454 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4455 return 0;
4456 }4457
/*
 *	Standard processing of a received RST: mark the socket dead to
 *	further output, pick the error callers expect for the state we
 *	were in, move to CLOSE (subject to RFC 1337 TIME-WAIT protection)
 *	and wake anyone waiting. Consumes the skb and releases the socket.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Error code depends on how far the connection had got. */
	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk, TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk, TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return 0;
}
/*
 *	A TCP packet has arrived. Main receive entry point: checksum,
 *	socket lookup, backlog queueing, then the RFC 793 segment-arrival
 *	processing (with RFC 1122 corrections), dispatching to tcp_ack,
 *	tcp_urg and tcp_data.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	unsigned long daddr, unsigned short len,
	unsigned long saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;	/* set when a SYN is expected (SYN_SENT handshake) */

	if (!skb)
	{
		printk("IMPOSSIBLE 1\n");
		return(0);
	}

	if (!dev)
	{
		printk("IMPOSSIBLE 2\n");
		return(0);
	}

	tcp_statistics.TcpInSegs++;

	/* Only packets addressed to this host are processed. */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket.
	 */

	sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo)	/* first pass (not replayed from the backlog) */
	{
		if (tcp_check(th, len, saddr, daddr ))
		{
			/* Bad checksum - drop silently. The socket was never
			   marked in use, so nothing to release. */
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			return(0);
		}
		th->seq = ntohl(th->seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset
			 *	(checked in tcp_reset), then discard the frame.
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->len = len;
		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here. */
		cli();
		if (sk->inuse)
		{
			/* Socket busy - queue for later replay (redo=1). */
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Replay from backlog - the socket may have vanished meanwhile. */
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}

	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}

	/*
	 *	Charge the memory to the socket; drop if the receive buffer
	 *	is already full.
	 */

	if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

	skb->sk=sk;
	sk->rmem_alloc += skb->mem_len;

	/*
	 *	This basically follows the flow suggested by RFC793, with the
	 *	corrections in RFC1122. We don't implement precedence and we
	 *	process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn
	 *	notes in the KA9Q code the RFC793 incoming segment rules don't
	 *	initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)	/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			/* These use the socket TOS.. might want to be the
			   received TOS */
			if(th->ack)
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed
			 *	(old segments). Broadcast/multicast SYN isn't
			 *	allowed. Note - bug if you change the netmask on
			 *	a running connection it can go broadcast. Even
			 *	Sun's have this problem so I'm ignoring it.
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is
			 *	nothing else in the frame. KA9Q has an option to
			 *	send data with the syn, BSD accepts data with the
			 *	syn up to the [to be] advertised window and
			 *	Solaris 2.1 gives you a protocol error. For now
			 *	we just ignore it, that fits the spec precisely
			 *	and avoids incompatibilities. It would be nice in
			 *	future to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and
		 *	either reset for bad matches or go to connected.
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/* Reset the ack - its an ack from a
					   different connection [ th->rst is
					   checked in tcp_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}

				/*
				 *	Ok.. it's good. Set up sequence numbers
				 *	and move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq=th->seq+1;
				sk->fin_seq=th->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if(sk->max_window==0)
				{
					sk->max_window = 32;	/* Sanity check */
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
					   sk->dummy_th.source==th->source &&
					   sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port.
	 *	There is a more complex suggestion for fixing these reuse issues
	 *	in RFC1644 but not yet ready for general use. Also see RFC1379.
	 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(th->seq, sk->acked_seq) && !th->rst)
		{
			long seq=sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			/* Kill the old connection and hand the SYN to a
			   listener on the same port, with a sequence number
			   safely beyond the old connection's. */
			sk->rmem_alloc -= skb->mem_len;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->mem_len;
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC).
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs.
	 */

	/* Step 1: sequence-number acceptability check. */
	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/* Step 2: process RST. */
	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 *	Step 4: !syn_ok is effectively the state test in RFC793 -
	 *	an unexpected SYN in the window is an error.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}

	/*
	 *	Step 5: process the ACK.
	 */

	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	Step 6: process urgent data.
	 */

	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	Step 7: process the encapsulated data.
	 */

	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */

	release_sock(sk);
	return 0;
}
/*
 *	This routine sends a packet with an out of date sequence
 *	number. It assumes the other end will try to ack it - i.e. a
 *	zero-window probe / keepalive-style segment.
 */

static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev=NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 *	Write data can still be transmitted/retransmitted in the
	 *	following states. If any other state is encountered, return.
	 *	[listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED &&
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 &&
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	)
	{
		return;
	}

	/* Atomic allocation - we may be called from timer context. */
	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
		return;

	buff->len = sizeof(struct tcphdr);
	buff->free = 1;		/* throwaway frame - freed after transmit */
	buff->sk = sk;
	buff->localroute = sk->localroute;

	t1 = (struct tcphdr *) buff->data;

	/* Put in the IP header and routing stuff. */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}

	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 +tmp);

	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

	/*
	 *	Use a previous sequence.
	 *	This should cause the other end to send an ack.
	 */

	t1->seq = htonl(sk->sent_seq-1);
	t1->ack = 1;
	t1->res1= 0;
	t1->res2= 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of
			   data - thus no FIN bit */
	t1->syn = 0;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(tcp_select_window(sk));
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 *	Send it and free it.
	 *	This will prevent the timer from automatically being restarted.
	 */
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
/*
 *	A window probe timeout has occurred. Send a probe segment, then
 *	back the retransmission timeout off exponentially (capped at
 *	two minutes) and re-arm the probe timer.
 */

void tcp_send_probe0(struct sock *sk)
{
	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	tcp_write_wakeup(sk);

	/* Exponential backoff of the probe interval, capped at 120s. */
	sk->backoff++;
	sk->rto = min(sk->rto << 1, 120*HZ);
	reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
	sk->retransmits++;
	sk->prot->retransmits ++;
}
/*
 *	Socket option code for TCP.
 */
4989 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */4990 {4991 intval,err;
4992
4993 if(level!=SOL_TCP)
4994 returnip_setsockopt(sk,level,optname,optval,optlen);
4995
4996 if (optval == NULL)
4997 return(-EINVAL);
4998
4999 err=verify_area(VERIFY_READ, optval, sizeof(int));
5000 if(err)
5001 returnerr;
5002
5003 val = get_fs_long((unsignedlong *)optval);
5004
5005 switch(optname)
5006 {5007 caseTCP_MAXSEG:
5008 /*5009 * values greater than interface MTU won't take effect. however at5010 * the point when this call is done we typically don't yet know5011 * which interface is going to be used5012 */5013 if(val<1||val>MAX_WINDOW)
5014 return -EINVAL;
5015 sk->user_mss=val;
5016 return 0;
5017 caseTCP_NODELAY:
5018 sk->nonagle=(val==0)?0:1;
5019 return 0;
5020 default:
5021 return(-ENOPROTOOPT);
5022 }5023 }5024
5025 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5026 {5027 intval,err;
5028
5029 if(level!=SOL_TCP)
5030 returnip_getsockopt(sk,level,optname,optval,optlen);
5031
5032 switch(optname)
5033 {5034 caseTCP_MAXSEG:
5035 val=sk->user_mss;
5036 break;
5037 caseTCP_NODELAY:
5038 val=sk->nonagle;
5039 break;
5040 default:
5041 return(-ENOPROTOOPT);
5042 }5043 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5044 if(err)
5045 returnerr;
5046 put_fs_long(sizeof(int),(unsignedlong *) optlen);
5047
5048 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5049 if(err)
5050 returnerr;
5051 put_fs_long(val,(unsignedlong *)optval);
5052
5053 return(0);
5054 }5055
5056
5057 structprototcp_prot = {5058 sock_wmalloc,
5059 sock_rmalloc,
5060 sock_wfree,
5061 sock_rfree,
5062 sock_rspace,
5063 sock_wspace,
5064 tcp_close,
5065 tcp_read,
5066 tcp_write,
5067 tcp_sendto,
5068 tcp_recvfrom,
5069 ip_build_header,
5070 tcp_connect,
5071 tcp_accept,
5072 ip_queue_xmit,
5073 tcp_retransmit,
5074 tcp_write_wakeup,
5075 tcp_read_wakeup,
5076 tcp_rcv,
5077 tcp_select,
5078 tcp_ioctl,
5079 NULL,
5080 tcp_shutdown,
5081 tcp_setsockopt,
5082 tcp_getsockopt,
5083 128,
5084 0,
5085 {NULL,},
5086 "TCP",
5087 0, 0
5088 };