1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@no.unit.nvg> 20 * 21 * Fixes: 22 * Alan Cox : Numerous verify_area() calls 23 * Alan Cox : Set the ACK bit on a reset 24 * Alan Cox : Stopped it crashing if it closed while sk->inuse=1 25 * and was trying to connect (tcp_err()). 26 * Alan Cox : All icmp error handling was broken 27 * pointers passed where wrong and the 28 * socket was looked up backwards. Nobody 29 * tested any icmp error code obviously. 30 * Alan Cox : tcp_err() now handled properly. It wakes people 31 * on errors. select behaves and the icmp error race 32 * has gone by moving it into sock.c 33 * Alan Cox : tcp_reset() fixed to work for everything not just 34 * packets for unknown sockets. 35 * Alan Cox : tcp option processing. 36 * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong] 37 * Herp Rosmanith : More reset fixes 38 * Alan Cox : No longer acks invalid rst frames. Acking 39 * any kind of RST is right out. 40 * Alan Cox : Sets an ignore me flag on an rst receive 41 * otherwise odd bits of prattle escape still 42 * Alan Cox : Fixed another acking RST frame bug. Should stop 43 * LAN workplace lockups. 
44 * Alan Cox : Some tidyups using the new skb list facilities 45 * Alan Cox : sk->keepopen now seems to work 46 * Alan Cox : Pulls options out correctly on accepts 47 * Alan Cox : Fixed assorted sk->rqueue->next errors 48 * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops. 49 * Alan Cox : Tidied tcp_data to avoid a potential nasty. 50 * Alan Cox : Added some better commenting, as the tcp is hard to follow 51 * Alan Cox : Removed incorrect check for 20 * psh 52 * Michael O'Reilly : ack < copied bug fix. 53 * Johannes Stille : Misc tcp fixes (not all in yet). 54 * Alan Cox : FIN with no memory -> CRASH 55 * Alan Cox : Added socket option proto entries. Also added awareness of them to accept. 56 * Alan Cox : Added TCP options (SOL_TCP) 57 * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets. 58 * Alan Cox : Use ip_tos/ip_ttl settings. 59 * Alan Cox : Handle FIN (more) properly (we hope). 60 * Alan Cox : RST frames sent on unsynchronised state ack error/ 61 * Alan Cox : Put in missing check for SYN bit. 62 * Alan Cox : Added tcp_select_window() aka NET2E 63 * window non shrink trick. 64 * Alan Cox : Added a couple of small NET2E timer fixes 65 * Charles Hedrick : TCP fixes 66 * Toomas Tamm : TCP window fixes 67 * Alan Cox : Small URG fix to rlogin ^C ack fight 68 * Charles Hedrick : Rewrote most of it to actually work 69 * Linus : Rewrote tcp_read() and URG handling 70 * completely 71 * Gerhard Koerting: Fixed some missing timer handling 72 * Matthew Dillon : Reworked TCP machine states as per RFC 73 * Gerhard Koerting: PC/TCP workarounds 74 * Adam Caldwell : Assorted timer/timing errors 75 * Matthew Dillon : Fixed another RST bug 76 * Alan Cox : Move to kernel side addressing changes. 77 * Alan Cox : Beginning work on TCP fastpathing (not yet usable) 78 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
79 * Alan Cox : TCP fast path debugging 80 * Alan Cox : Window clamping 81 * Michael Riepe : Bug in tcp_check() 82 * Matt Dillon : More TCP improvements and RST bug fixes 83 * Matt Dillon : Yet more small nasties remove from the TCP code 84 * (Be very nice to this man if tcp finally works 100%) 8) 85 * Alan Cox : BSD accept semantics. 86 * Alan Cox : Reset on closedown bug. 87 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 88 * Michael Pall : Handle select() after URG properly in all cases. 89 * Michael Pall : Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin). 90 * Michael Pall : Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now. 91 * Michael Pall : recv(...,MSG_OOB) never blocks in the BSD api. 92 * Alan Cox : Changed the semantics of sk->socket to 93 * fix a race and a signal problem with 94 * accept() and async I/O. 95 * Alan Cox : Relaxed the rules on tcp_sendto(). 96 * Yury Shevchuk : Really fixed accept() blocking problem. 97 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 98 * clients/servers which listen in on 99 * fixed ports. 100 * Alan Cox : Cleaned the above up and shrank it to 101 * a sensible code size. 102 * Alan Cox : Self connect lockup fix. 103 * Alan Cox : No connect to multicast. 104 * Ross Biro : Close unaccepted children on master 105 * socket close. 106 * Alan Cox : Reset tracing code. 107 * Alan Cox : Spurious resets on shutdown. 108 * Alan Cox : Giant 15 minute/60 second timer error 109 * Alan Cox : Small whoops in selecting before an accept. 110 * Alan Cox : Kept the state trace facility since its 111 * handy for debugging. 112 * Alan Cox : More reset handler fixes. 113 * Alan Cox : Started rewriting the code based on the RFC's 114 * for other useful protocol references see: 115 * Comer, KA9Q NOS, and for a reference on the 116 * difference between specifications and how BSD 117 * works see the 4.4lite source. 
118 * A.N.Kuznetsov : Don't time wait on completion of tidy 119 * close. 120 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 121 * Linus Torvalds : Fixed BSD port reuse to work first syn 122 * Alan Cox : Reimplemented timers as per the RFC and using multiple 123 * timers for sanity. 124 * Alan Cox : Small bug fixes, and a lot of new 125 * comments. 126 * Alan Cox : Fixed dual reader crash by locking 127 * the buffers (much like datagram.c) 128 * Alan Cox : Fixed stuck sockets in probe. A probe 129 * now gets fed up of retrying without 130 * (even a no space) answer. 131 * Alan Cox : Extracted closing code better 132 * Alan Cox : Fixed the closing state machine to 133 * resemble the RFC. 134 * Alan Cox : More 'per spec' fixes. 135 * 136 * 137 * To Fix: 138 * Fast path the code. Two things here - fix the window calculation 139 * so it doesn't iterate over the queue, also spot packets with no funny 140 * options arriving in order and process directly. 141 * 142 * Implement RFC 1191 [Path MTU discovery] 143 * Look at the effect of implementing RFC 1337 suggestions and their impact. 144 * Rewrite output state machine to use a single queue and do low window 145 * situations as per the spec (RFC 1122) 146 * Speed up input assembly algorithm. 147 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 148 * could do with it working on IPv4 149 * User settable/learned rtt/max window/mtu 150 * Cope with MTU/device switches when retransmitting in tcp. 151 * Fix the window handling to use PR's new code. 152 * 153 * Change the fundamental structure to a single send queue maintained 154 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 155 * active routes too]). Cut the queue off in tcp_retransmit/ 156 * tcp_transmit. 157 * Change the receive queue to assemble as it goes. This lets us 158 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 159 * tcp_data/tcp_read as well as the window shrink crud. 
 *	Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 *	tcp_queue_skb seem obvious routines to extract.
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 * Description of States:
 *
 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 *
 *	TCP_SYN_RECV		received a connection request, sent ack,
 *				waiting for final ack in three-way handshake.
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *				to shutdown
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING.  Required because the other end
 *				may not have gotten our last ACK causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown.  There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */

203 #include <linux/types.h>
204 #include <linux/sched.h>
205 #include <linux/mm.h>
206 #include <linux/time.h>
207 #include <linux/string.h>
208 #include <linux/config.h>
209 #include <linux/socket.h>
210 #include <linux/sockios.h>
211 #include <linux/termios.h>
212 #include <linux/in.h>
213 #include <linux/fcntl.h>
214 #include <linux/inet.h>
215 #include <linux/netdevice.h>
216 #include "snmp.h"
217 #include "ip.h"
218 #include "protocol.h"
219 #include "icmp.h"
220 #include "tcp.h"
221 #include "arp.h"
222 #include <linux/skbuff.h>
223 #include "sock.h"
224 #include "route.h"
225 #include <linux/errno.h>
226 #include <linux/timer.h>
227 #include <asm/system.h>
228 #include <asm/segment.h>
229 #include <linux/mm.h>
230
231 /* 232 * The MSL timer is the 'normal' timer. 233 */ 234
235 #definereset_msl_timer(x,y,z) reset_timer(x,y,z)
236
237 #defineSEQ_TICK 3
238 unsignedlongseq_offset;
239 structtcp_mibtcp_statistics;
240
241 staticvoidtcp_close(structsock *sk, inttimeout);
242
243
244 /* 245 * The less said about this the better, but it works and will do for 1.2 246 */ 247
248 staticstructwait_queue *master_select_wakeup;
249
250 static__inline__intmin(unsignedinta, unsignedintb)
/* */ 251 { 252 if (a < b)
253 return(a);
254 return(b);
255 } 256
257 #undefSTATE_TRACE 258
259 #ifdefSTATE_TRACE 260 staticchar *statename[]={ 261 "Unused","Established","Syn Sent","Syn Recv",
262 "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
263 "Close Wait","Last ACK","Listen","Closing"
264 };
265 #endif 266
267 static__inline__voidtcp_set_state(structsock *sk, intstate)
/* */ 268 { 269 if(sk->state==TCP_ESTABLISHED)
270 tcp_statistics.TcpCurrEstab--;
271 #ifdefSTATE_TRACE 272 if(sk->debug)
273 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
274 #endif 275 /* This is a hack but it doesn't occur often and its going to 276 be a real to fix nicely */ 277
278 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
279 { 280 wake_up_interruptible(&master_select_wakeup);
281 } 282 sk->state=state;
283 if(state==TCP_ESTABLISHED)
284 tcp_statistics.TcpCurrEstab++;
285 } 286
287 /* 288 * This routine picks a TCP windows for a socket based on 289 * the following constraints 290 * 291 * 1. The window can never be shrunk once it is offered (RFC 793) 292 * 2. We limit memory per socket 293 * 294 * For now we use NET2E3's heuristic of offering half the memory 295 * we have handy. All is not as bad as this seems however because 296 * of two things. Firstly we will bin packets even within the window 297 * in order to get the data we are waiting for into the memory limit. 298 * Secondly we bin common duplicate forms at receive time 299 * Better heuristics welcome 300 */ 301
302 inttcp_select_window(structsock *sk)
/* */ 303 { 304 intnew_window = sk->prot->rspace(sk);
305
306 if(sk->window_clamp)
307 new_window=min(sk->window_clamp,new_window);
308 /* 309 * Two things are going on here. First, we don't ever offer a 310 * window less than min(sk->mss, MAX_WINDOW/2). This is the 311 * receiver side of SWS as specified in RFC1122. 312 * Second, we always give them at least the window they 313 * had before, in order to avoid retracting window. This 314 * is technically allowed, but RFC1122 advises against it and 315 * in practice it causes trouble. 316 * 317 * Fixme: This doesn't correctly handle the case where 318 * new_window > sk->window but not by enough to allow for the 319 * shift in sequence space. 320 */ 321 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
322 return(sk->window);
323 return(new_window);
324 } 325
326 /* 327 * Find someone to 'accept'. Must be called with 328 * sk->inuse=1 or cli() 329 */ 330
331 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 332 { 333 structsk_buff *p=skb_peek(&s->receive_queue);
334 if(p==NULL)
335 returnNULL;
336 do 337 { 338 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
339 returnp;
340 p=p->next;
341 } 342 while(p!=(structsk_buff *)&s->receive_queue);
343 returnNULL;
344 } 345
346 /* 347 * Remove a completed connection and return it. This is used by 348 * tcp_accept() to get connections from the queue. 349 */ 350
351 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 352 { 353 structsk_buff *skb;
354 unsignedlongflags;
355 save_flags(flags);
356 cli();
357 skb=tcp_find_established(s);
358 if(skb!=NULL)
359 skb_unlink(skb); /* Take it off the queue */ 360 restore_flags(flags);
361 returnskb;
362 } 363
364 /* 365 * This routine closes sockets which have been at least partially 366 * opened, but not yet accepted. Currently it is only called by 367 * tcp_close, and timeout mirrors the value there. 368 */ 369
370 staticvoidtcp_close_pending (structsock *sk)
/* */ 371 { 372 structsk_buff *skb;
373
374 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) { 375 tcp_close(skb->sk, 0);
376 kfree_skb(skb, FREE_READ);
377 } 378 return;
379 } 380
381 /* 382 * Enter the time wait state. 383 */ 384
385 staticvoidtcp_time_wait(structsock *sk)
/* */ 386 { 387 tcp_set_state(sk,TCP_TIME_WAIT);
388 sk->shutdown = SHUTDOWN_MASK;
389 if (!sk->dead)
390 sk->state_change(sk);
391 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
392 } 393
394 /* 395 * A socket has timed out on its send queue and wants to do a 396 * little retransmitting. Currently this means TCP. 397 */ 398
399 voidtcp_do_retransmit(structsock *sk, intall)
/* */ 400 { 401 structsk_buff * skb;
402 structproto *prot;
403 structdevice *dev;
404 intct=0;
405
406 prot = sk->prot;
407 skb = sk->send_head;
408
409 while (skb != NULL)
410 { 411 structtcphdr *th;
412 structiphdr *iph;
413 intsize;
414
415 dev = skb->dev;
416 IS_SKB(skb);
417 skb->when = jiffies;
418
419 /* 420 * In general it's OK just to use the old packet. However we 421 * need to use the current ack and window fields. Urg and 422 * urg_ptr could possibly stand to be updated as well, but we 423 * don't keep the necessary data. That shouldn't be a problem, 424 * if the other end is doing the right thing. Since we're 425 * changing the packet, we have to issue a new IP identifier. 426 */ 427
428 iph = (structiphdr *)(skb->data + dev->hard_header_len);
429 th = (structtcphdr *)(((char *)iph) + (iph->ihl << 2));
430 size = skb->len - (((unsignedchar *) th) - skb->data);
431
432 /* 433 * Note: We ought to check for window limits here but 434 * currently this is done (less efficiently) elsewhere. 435 * We do need to check for a route change but can't handle 436 * that until we have the new 1.3.x buffers in. 437 * 438 */ 439
440 iph->id = htons(ip_id_count++);
441 ip_send_check(iph);
442
443 /* 444 * This is not the right way to handle this. We have to 445 * issue an up to date window and ack report with this 446 * retransmit to keep the odd buggy tcp that relies on 447 * the fact BSD does this happy. 448 * We don't however need to recalculate the entire 449 * checksum, so someone wanting a small problem to play 450 * with might like to implement RFC1141/RFC1624 and speed 451 * this up by avoiding a full checksum. 452 */ 453
454 th->ack_seq = ntohl(sk->acked_seq);
455 th->window = ntohs(tcp_select_window(sk));
456 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
457
458 /* 459 * If the interface is (still) up and running, kick it. 460 */ 461
462 if (dev->flags & IFF_UP)
463 { 464 /* 465 * If the packet is still being sent by the device/protocol 466 * below then don't retransmit. This is both needed, and good - 467 * especially with connected mode AX.25 where it stops resends 468 * occurring of an as yet unsent anyway frame! 469 * We still add up the counts as the round trip time wants 470 * adjusting. 471 */ 472 if (sk && !skb_device_locked(skb))
473 { 474 /* Remove it from any existing driver queue first! */ 475 skb_unlink(skb);
476 /* Now queue it */ 477 ip_statistics.IpOutRequests++;
478 dev_queue_xmit(skb, dev, sk->priority);
479 } 480 } 481
482 /* 483 * Count retransmissions 484 */ 485
486 ct++;
487 sk->prot->retransmits ++;
488
489 /* 490 * Only one retransmit requested. 491 */ 492
493 if (!all)
494 break;
495
496 /* 497 * This should cut it off before we send too many packets. 498 */ 499
500 if (ct >= sk->cong_window)
501 break;
502 skb = skb->link3;
503 } 504 } 505
506 /* 507 * Reset the retransmission timer 508 */ 509
510 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 511 { 512 del_timer(&sk->retransmit_timer);
513 sk->ip_xmit_timeout = why;
514 if((int)when < 0)
515 { 516 when=3;
517 printk("Error: Negative timer in xmit_timer\n");
518 } 519 sk->retransmit_timer.expires=when;
520 add_timer(&sk->retransmit_timer);
521 } 522
523 /* 524 * This is the normal code called for timeouts. It does the retransmission 525 * and then does backoff. tcp_do_retransmit is separated out because 526 * tcp_ack needs to send stuff from the retransmit queue without 527 * initiating a backoff. 528 */ 529
530
531 voidtcp_retransmit_time(structsock *sk, intall)
/* */ 532 { 533 tcp_do_retransmit(sk, all);
534
535 /* 536 * Increase the timeout each time we retransmit. Note that 537 * we do not increase the rtt estimate. rto is initialized 538 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 539 * that doubling rto each time is the least we can get away with. 540 * In KA9Q, Karn uses this for the first few times, and then 541 * goes to quadratic. netBSD doubles, but only goes up to *64, 542 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 543 * defined in the protocol as the maximum possible RTT. I guess 544 * we'll have to use something other than TCP to talk to the 545 * University of Mars. 546 * 547 * PAWS allows us longer timeouts and large windows, so once 548 * implemented ftp to mars will work nicely. We will have to fix 549 * the 120 second clamps though! 550 */ 551
552 sk->retransmits++;
553 sk->backoff++;
554 sk->rto = min(sk->rto << 1, 120*HZ);
555 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
556 } 557
558
559 /* 560 * A timer event has trigger a tcp retransmit timeout. The 561 * socket xmit queue is ready and set up to send. Because 562 * the ack receive code keeps the queue straight we do 563 * nothing clever here. 564 */ 565
566 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 567 { 568 if (all)
569 { 570 tcp_retransmit_time(sk, all);
571 return;
572 } 573
574 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 575 /* sk->ssthresh in theory can be zero. I guess that's OK */ 576 sk->cong_count = 0;
577
578 sk->cong_window = 1;
579
580 /* Do the actual retransmit. */ 581 tcp_retransmit_time(sk, all);
582 } 583
584 /* 585 * A write timeout has occurred. Process the after effects. 586 */ 587
588 staticinttcp_write_timeout(structsock *sk)
/* */ 589 { 590 /* 591 * Look for a 'soft' timeout. 592 */ 593 if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
594 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
595 { 596 /* 597 * Attempt to recover if arp has changed (unlikely!) or 598 * a route has shifted (not supported prior to 1.3). 599 */ 600 arp_destroy (sk->daddr, 0);
601 ip_route_check (sk->daddr);
602 } 603 /* 604 * Has it gone just too far ? 605 */ 606 if (sk->retransmits > TCP_RETR2)
607 { 608 sk->err = ETIMEDOUT;
609 sk->error_report(sk);
610 /* 611 * Time wait the socket 612 */ 613 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING)
614 { 615 tcp_set_state(sk,TCP_TIME_WAIT);
616 reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
617 } 618 else 619 { 620 /* 621 * Clean up time. 622 */ 623 tcp_set_state(sk, TCP_CLOSE);
624 return 0;
625 } 626 } 627 return 1;
628 } 629
630 /* 631 * The TCP retransmit timer. This lacks a few small details. 632 * 633 * 1. An initial rtt timeout on the probe0 should cause what we can 634 * of the first write queue buffer to be split and sent. 635 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report 636 * ETIMEDOUT if we know an additional 'soft' error caused this. 637 * tcp_err should save a 'soft error' for us. 638 */ 639
640 staticvoidretransmit_timer(unsignedlongdata)
/* */ 641 { 642 structsock *sk = (structsock*)data;
643 intwhy = sk->ip_xmit_timeout;
644
645 /* 646 * only process if socket is not in use 647 */ 648
649 cli();
650 if (sk->inuse || in_bh)
651 { 652 /* Try again in 1 second */ 653 sk->retransmit_timer.expires = HZ;
654 add_timer(&sk->retransmit_timer);
655 sti();
656 return;
657 } 658
659 sk->inuse = 1;
660 sti();
661
662 /* Always see if we need to send an ack. */ 663
664 if (sk->ack_backlog && !sk->zapped)
665 { 666 sk->prot->read_wakeup (sk);
667 if (! sk->dead)
668 sk->data_ready(sk,0);
669 } 670
671 /* Now we need to figure out why the socket was on the timer. */ 672
673 switch (why)
674 { 675 /* Window probing */ 676 caseTIME_PROBE0:
677 tcp_send_probe0(sk);
678 tcp_write_timeout(sk);
679 break;
680 /* Retransmitting */ 681 caseTIME_WRITE:
682 /* It could be we got here because we needed to send an ack. 683 * So we need to check for that. 684 */ 685 { 686 structsk_buff *skb;
687 unsignedlongflags;
688
689 save_flags(flags);
690 cli();
691 skb = sk->send_head;
692 if (!skb)
693 { 694 restore_flags(flags);
695 } 696 else 697 { 698 /* 699 * Kicked by a delayed ack. Reset timer 700 * correctly now 701 */ 702 if (jiffies < skb->when + sk->rto)
703 { 704 reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
705 restore_flags(flags);
706 break;
707 } 708 restore_flags(flags);
709 /* 710 * Retransmission 711 */ 712 sk->prot->retransmit (sk, 0);
713 tcp_write_timeout(sk);
714 } 715 break;
716 } 717 /* Sending Keepalives */ 718 caseTIME_KEEPOPEN:
719 /* 720 * this reset_timer() call is a hack, this is not 721 * how KEEPOPEN is supposed to work. 722 */ 723 reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
724
725 /* Send something to keep the connection open. */ 726 if (sk->prot->write_wakeup)
727 sk->prot->write_wakeup (sk);
728 sk->retransmits++;
729 tcp_write_timeout(sk);
730 break;
731 default:
732 printk ("rexmit_timer: timer expired - reason unknown\n");
733 break;
734 } 735 release_sock(sk);
736 } 737
738 /* 739 * This routine is called by the ICMP module when it gets some 740 * sort of error condition. If err < 0 then the socket should 741 * be closed and the error returned to the user. If err > 0 742 * it's just the icmp type << 8 | icmp code. After adjustment 743 * header points to the first 8 bytes of the tcp header. We need 744 * to find the appropriate port. 745 */ 746
747 voidtcp_err(interr, unsignedchar *header, unsignedlongdaddr,
/* */ 748 unsignedlongsaddr, structinet_protocol *protocol)
749 { 750 structtcphdr *th;
751 structsock *sk;
752 structiphdr *iph=(structiphdr *)header;
753
754 header+=4*iph->ihl;
755
756
757 th =(structtcphdr *)header;
758 sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
759
760 if (sk == NULL)
761 return;
762
763 if(err<0)
764 { 765 sk->err = -err;
766 sk->error_report(sk);
767 return;
768 } 769
770 if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8))
771 { 772 /* 773 * FIXME: 774 * For now we will just trigger a linear backoff. 775 * The slow start code should cause a real backoff here. 776 */ 777 if (sk->cong_window > 4)
778 sk->cong_window--;
779 return;
780 } 781
782 /* sk->err = icmp_err_convert[err & 0xff].errno; -- moved as TCP should hide non fatals internally (and does) */ 783
784 /* 785 * If we've already connected we will keep trying 786 * until we time out, or the user gives up. 787 */ 788
789 if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT)
790 { 791 if (sk->state == TCP_SYN_SENT)
792 { 793 tcp_statistics.TcpAttemptFails++;
794 tcp_set_state(sk,TCP_CLOSE);
795 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ 796 } 797 sk->err = icmp_err_convert[err & 0xff].errno;
798 } 799 return;
800 } 801
802
803 /* 804 * Walk down the receive queue counting readable data until we hit the end or we find a gap 805 * in the received data queue (ie a frame missing that needs sending to us). Not 806 * sorting using two queues as data arrives makes life so much harder. 807 */ 808
809 staticinttcp_readable(structsock *sk)
/* */ 810 { 811 unsignedlongcounted;
812 unsignedlongamount;
813 structsk_buff *skb;
814 intsum;
815 unsignedlongflags;
816
817 if(sk && sk->debug)
818 printk("tcp_readable: %p - ",sk);
819
820 save_flags(flags);
821 cli();
822 if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
823 { 824 restore_flags(flags);
825 if(sk && sk->debug)
826 printk("empty\n");
827 return(0);
828 } 829
830 counted = sk->copied_seq; /* Where we are at the moment */ 831 amount = 0;
832
833 /* 834 * Do until a push or until we are out of data. 835 */ 836
837 do 838 { 839 if (before(counted, skb->h.th->seq)) /* Found a hole so stops here */ 840 break;
841 sum = skb->len -(counted - skb->h.th->seq); /* Length - header but start from where we are up to (avoid overlaps) */ 842 if (skb->h.th->syn)
843 sum++;
844 if (sum > 0)
845 {/* Add it up, move on */ 846 amount += sum;
847 if (skb->h.th->syn)
848 amount--;
849 counted += sum;
850 } 851 /* 852 * Don't count urg data ... but do it in the right place! 853 * Consider: "old_data (ptr is here) URG PUSH data" 854 * The old code would stop at the first push because 855 * it counted the urg (amount==1) and then does amount-- 856 * *after* the loop. This means tcp_readable() always 857 * returned zero if any URG PUSH was in the queue, even 858 * though there was normal data available. If we subtract 859 * the urg data right here, we even get it to work for more 860 * than one URG PUSH skb without normal data. 861 * This means that select() finally works now with urg data 862 * in the queue. Note that rlogin was never affected 863 * because it doesn't use select(); it uses two processes 864 * and a blocking read(). And the queue scan in tcp_read() 865 * was correct. Mike <pall@rz.uni-karlsruhe.de> 866 */ 867 if (skb->h.th->urg)
868 amount--; /* don't count urg data */ 869 if (amount && skb->h.th->psh) break;
870 skb = skb->next;
871 } 872 while(skb != (structsk_buff *)&sk->receive_queue);
873
874 restore_flags(flags);
875 if(sk->debug)
876 printk("got %lu bytes.\n",amount);
877 return(amount);
878 } 879
880 /* 881 * LISTEN is a special case for select.. 882 */ 883 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */ 884 { 885 if (sel_type == SEL_IN) { 886 intretval;
887
888 sk->inuse = 1;
889 retval = (tcp_find_established(sk) != NULL);
890 release_sock(sk);
891 if (!retval)
892 select_wait(&master_select_wakeup,wait);
893 returnretval;
894 } 895 return 0;
896 } 897
898
899 /* 900 * Wait for a TCP event. 901 * 902 * Note that we don't need to set "sk->inuse", as the upper select layers 903 * take care of normal races (between the test and the event) and we don't 904 * go look at any of the socket buffers directly. 905 */ 906 staticinttcp_select(structsock *sk, intsel_type, select_table *wait)
/* */ 907 { 908 if (sk->state == TCP_LISTEN)
909 returntcp_listen_select(sk, sel_type, wait);
910
911 switch(sel_type) { 912 caseSEL_IN:
913 if (sk->err)
914 return 1;
915 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
916 break;
917
918 if (sk->shutdown & RCV_SHUTDOWN)
919 return 1;
920
921 if (sk->acked_seq == sk->copied_seq)
922 break;
923
924 if (sk->urg_seq != sk->copied_seq ||
925 sk->acked_seq != sk->copied_seq+1 ||
926 sk->urginline || !sk->urg_data)
927 return 1;
928 break;
929
930 caseSEL_OUT:
931 if (sk->shutdown & SEND_SHUTDOWN)
932 return 0;
933 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
934 break;
935 /* 936 * This is now right thanks to a small fix 937 * by Matt Dillon. 938 */ 939
940 if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
941 break;
942 return 1;
943
944 caseSEL_EX:
945 if (sk->err || sk->urg_data)
946 return 1;
947 break;
948 } 949 select_wait(sk->sleep, wait);
950 return 0;
951 } 952
953 inttcp_ioctl(structsock *sk, intcmd, unsignedlongarg)
/* */ 954 { 955 interr;
956 switch(cmd)
957 { 958
959 caseTIOCINQ:
960 #ifdef FIXME /* FIXME: */ 961 caseFIONREAD:
962 #endif 963 { 964 unsignedlongamount;
965
966 if (sk->state == TCP_LISTEN)
967 return(-EINVAL);
968
969 sk->inuse = 1;
970 amount = tcp_readable(sk);
971 release_sock(sk);
972 err=verify_area(VERIFY_WRITE,(void *)arg,
973 sizeof(unsignedlong));
974 if(err)
975 returnerr;
976 put_fs_long(amount,(unsignedlong *)arg);
977 return(0);
978 } 979 caseSIOCATMARK:
980 { 981 intansw = sk->urg_data && sk->urg_seq == sk->copied_seq;
982
983 err = verify_area(VERIFY_WRITE,(void *) arg,
984 sizeof(unsignedlong));
985 if (err)
986 returnerr;
987 put_fs_long(answ,(int *) arg);
988 return(0);
989 } 990 caseTIOCOUTQ:
991 { 992 unsignedlongamount;
993
994 if (sk->state == TCP_LISTEN) return(-EINVAL);
995 amount = sk->prot->wspace(sk);
996 err=verify_area(VERIFY_WRITE,(void *)arg,
997 sizeof(unsignedlong));
998 if(err)
999 returnerr;
1000 put_fs_long(amount,(unsignedlong *)arg);
1001 return(0);
1002 }1003 default:
1004 return(-EINVAL);
1005 }1006 }1007
1008
/*
 *	This routine computes a TCP checksum.
 *
 *	Computes the Internet checksum over the TCP pseudo-header
 *	(saddr, daddr, protocol, length) followed by the TCP header and
 *	data at 'th' for 'len' bytes, and returns the one's complement of
 *	the folded 16-bit sum, ready to store in th->check.
 *	i386 only: both steps are hand-coded inline assembly.
 */
unsigned short tcp_check(struct tcphdr *th, int len,
	  unsigned long saddr, unsigned long daddr)
{
	unsigned long sum;

	/* A zero source address means "use our own" - fill it in. */
	if (saddr == 0) saddr = ip_my_addr();

	/*
	 * stupid, gcc complains when I use just one __asm__ block,
	 * something about too many reloads, but this is just two
	 * instructions longer than what I want
	 */
	/*
	 * Step 1: sum the pseudo-header. Starts the accumulator with
	 * daddr, adds saddr and the (byte-swapped length << 16) + protocol
	 * word, propagating carries with adcl.
	 */
	__asm__("
	    addl %%ecx, %%ebx
	    adcl %%edx, %%ebx
	    adcl $0, %%ebx
	    "
	: "=b"(sum)
	: "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
	: "bx", "cx", "dx" );
	/*
	 * Step 2: sum the segment itself. Processes 32-byte chunks
	 * (8 dwords per loop iteration), then the remaining dwords,
	 * then a trailing 16-bit word and a trailing byte if the length
	 * is odd, and finally folds the high 16 bits of the accumulator
	 * into the low 16 with end-around carry.
	 */
	__asm__("
	    movl %%ecx, %%edx
	    cld
	    cmpl $32, %%ecx
	    jb 2f
	    shrl $5, %%ecx
	    clc
1:	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    loop 1b
	    adcl $0, %%ebx
	    movl %%edx, %%ecx
2:	    andl $28, %%ecx
	    je 4f
	    shrl $2, %%ecx
	    clc
3:	    lodsl
	    adcl %%eax, %%ebx
	    loop 3b
	    adcl $0, %%ebx
4:	    movl $0, %%eax
	    testw $2, %%dx
	    je 5f
	    lodsw
	    addl %%eax, %%ebx
	    adcl $0, %%ebx
	    movw $0, %%ax
5:	    test $1, %%edx
	    je 6f
	    lodsb
	    addl %%eax, %%ebx
	    adcl $0, %%ebx
6:	    movl %%ebx, %%eax
	    shrl $16, %%eax
	    addw %%ax, %%bx
	    adcw $0, %%bx
	    "
	: "=b"(sum)
	: "0"(sum), "c"(len), "S"(th)
	: "ax", "bx", "cx", "dx", "si" );

	/* We only want the bottom 16 bits, but we never cleared the top 16. */

	return((~sum) & 0xffff);
}
1093
1094
1095 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1096 unsignedlongdaddr, intlen, structsock *sk)
1097 {1098 th->check = 0;
1099 th->check = tcp_check(th, len, saddr, daddr);
1100 return;
1101 }1102
/*
 *	This is the main buffer sending routine. We queue the buffer
 *	having checked it is sane seeming.
 *
 *	Either transmits the segment immediately (filling in ack/window
 *	and checksumming) or queues it on sk->write_queue when the window,
 *	Nagle's rule, or congestion forbid sending now.
 */
static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 *	length of packet (not counting length of pre-tcp headers)
	 */

	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 *	Sanity check it..
	 */

	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 *	If we have queued a header size packet.. (these crash a few
	 *	tcp stacks if ack is not set)
	 */

	if (size == sizeof(struct tcphdr))
	{
		/* If its got a syn or fin its notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 *	Actual processing.
	 */

	tcp_statistics.TcpOutSegs++;
	/* Sequence number of the byte just past this frame's data. */
	skb->h.seq = ntohl(th->seq) + size - 4*th->doff;

	/*
	 *	We must queue if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */

	if (after(skb->h.seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit. So
		 * we shouldn't need to set it at all. I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 *	If we don't fit we have to start the zero window
		 *	probes. This is broken - we really need to do a partial
		 *	send _first_ (This is what causes the Cisco and PC/TCP
		 *	grief).
		 */

		if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 *	This is going straight out
		 */

		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 *	This is mad. The tcp retransmit queue is put together
		 *	by the ip layer. This causes half the problems with
		 *	unroutable FIN's and other things.
		 */

		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 *	Set for next retransmit based on expected ACK time.
		 *	FIXME: We set this every time which means our
		 *	retransmits are really about a window behind.
		 */

		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
1218 /*1219 * Locking problems lead us to a messy situation where we can have1220 * multiple partially complete buffers queued up. This is really bad1221 * as we don't want to be sending partial buffers. Fix this with1222 * a semaphore or similar to lock tcp_write per socket.1223 *1224 * These routines are pretty self descriptive.1225 */1226
1227 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1228 {1229 structsk_buff * skb;
1230 unsignedlongflags;
1231
1232 save_flags(flags);
1233 cli();
1234 skb = sk->partial;
1235 if (skb) {1236 sk->partial = NULL;
1237 del_timer(&sk->partial_timer);
1238 }1239 restore_flags(flags);
1240 returnskb;
1241 }1242
1243 /*1244 * Empty the partial queue1245 */1246
1247 staticvoidtcp_send_partial(structsock *sk)
/* */1248 {1249 structsk_buff *skb;
1250
1251 if (sk == NULL)
1252 return;
1253 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1254 tcp_send_skb(sk, skb);
1255 }1256
/*
 *	Queue a partial frame
 *
 *	Installs 'skb' as the socket's pending half-built buffer and arms
 *	a timer so it is flushed even if no more data arrives. Any buffer
 *	that was already pending is sent immediately (after interrupts are
 *	restored, so we never transmit under cli()).
 */
void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	/* Swap the new buffer in under interrupt protection. */
	tmp = sk->partial;
	if (tmp)
		del_timer(&sk->partial_timer);
	sk->partial = skb;
	init_timer(&sk->partial_timer);
	/*
	 *	Wait up to 1 second for the buffer to fill.
	 */
	/* NOTE(review): expires is set to a relative tick count (HZ), not
	 * jiffies + HZ - presumably this kernel's add_timer() adds the
	 * current jiffies itself; confirm against kernel/sched.c. */
	sk->partial_timer.expires = HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	/* The displaced buffer goes out now, outside the cli() region. */
	if (tmp)
		tcp_send_skb(sk, tmp);
}
1285
/*
 *	This routine sends an ack and also updates the window.
 *
 *	sequence	value byte-swapped into the outgoing seq field
 *	ack		value byte-swapped into the outgoing ack_seq field
 *	sk		socket the ack belongs to
 *	th		header of the received segment; its ports are
 *			swapped to address the reply
 *	daddr		destination IP address for the ack
 *
 *	On atomic-allocation failure the ack is not lost: we bump
 *	ack_backlog and arm a short write timer so it goes out later.
 */
static void tcp_send_ack(unsigned long sequence, unsigned long ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;		/* We have been reset, we may not send again */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */

	buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 *	Force it to send an ack. We don't have to do this
		 *	(ACK is unreliable) but its much better use of
		 *	bandwidth on slow links to send a spare ack than
		 *	resend packets.
		 */

		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 *	Assemble a suitable TCP frame
	 */

	buff->len = sizeof(struct tcphdr);
	buff->sk = sk;
	buff->localroute = sk->localroute;
	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route: drop the buffer, the ack is simply not sent. */
		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}
	buff->len += tmp;
	t1 =(struct tcphdr *)((char *)t1 +tmp);

	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 *	If we have nothing queued for transmit and the transmit timer
	 *	is on we are just doing an ACK timeout and need to switch
	 *	to a keepalive.
	 */

	if (ack == sk->acked_seq)
	{
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
				  && sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 *	Fill in the packet and send it
	 */

	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		 printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}
1404
1405 /* 1406 * This routine builds a generic TCP header. 1407 */1408
1409 extern__inlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */1410 {1411
1412 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1413 th->seq = htonl(sk->write_seq);
1414 th->psh =(push == 0) ? 1 : 0;
1415 th->doff = sizeof(*th)/4;
1416 th->ack = 1;
1417 th->fin = 0;
1418 sk->ack_backlog = 0;
1419 sk->bytes_rcv = 0;
1420 sk->ack_timed = 0;
1421 th->ack_seq = htonl(sk->acked_seq);
1422 sk->window = tcp_select_window(sk);
1423 th->window = htons(sk->window);
1424
1425 return(sizeof(*th));
1426 }1427
/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	Returns the number of bytes queued/sent, or a negative errno if
 *	nothing was copied. Partial progress always wins over an error:
 *	once 'copied' is non-zero, error paths return the byte count.
 */
static int tcp_write(struct sock *sk, unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;
	int copy;
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	unsigned char *buff;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1;
	prot = sk->prot;
	while(len > 0)
	{
		if (sk->err)
		{			/* Stop on an error */
			release_sock(sk);
			if (copied)
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 *	First thing we do is make sure that we are established.
		 */

		if (sk->shutdown & SEND_SHUTDOWN)
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied)
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/*
		 *	Wait for a connection to finish.
		 */

		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
		{
			if (sk->err)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			/* Not even connecting: writing here is a broken pipe. */
			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
			{
				release_sock(sk);
				if (copied)
					return(copied);

				if (sk->err)
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen)
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			release_sock(sk);
			cli();

			/* Re-check the state under cli() before sleeping so a
			 * completed connect cannot slip past us. */
			if (sk->state != TCP_ESTABLISHED &&
			    sk->state != TCP_CLOSE_WAIT && sk->err == 0)
			{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

	/*
	 *	The following code can result in copy <= if sk->mss is ever
	 *	decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
	 *	sk->mtu is constant once SYN processing is finished. I.e. we
	 *	had better not get here until we've seen his SYN and at least one
	 *	valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
	 *	But ESTABLISHED should guarantee that. sk->max_window is by definition
	 *	non-decreasing. Note that any ioctl to set user_mss must be done
	 *	before the exchange of SYN's. If the initial ack from the other
	 *	end has a window of 0, max_window and thus mss will both be 0.
	 */

	/*
	 *	Now we need to check if we have a half built packet.
	 */

		if ((skb = tcp_dequeue_partial(sk)) != NULL)
		{
			int hdrlen;

			/* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				 + sizeof(struct tcphdr);

			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB))
			{
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0)
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}

				memcpy_fromfs(skb->data + skb->len, from, copy);
				skb->len += copy;
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy;
			}
			/* Full, urgent, or nothing in flight: send it now;
			 * otherwise park it again as the partial buffer. */
			if ((skb->len - hdrlen) >= sk->mss ||
			    (flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);
			continue;
		}

	/*
	 *	We also need to worry about the window.
	 *	If window < 1/2 the maximum window we've seen from this
	 *	host, don't use it. This is sender side
	 *	silly window prevention, as specified in RFC1122.
	 *	(Note that this is different than earlier versions of
	 *	SWS prevention, e.g. RFC813.). What we actually do is
	 *	use the whole MSS. Since the results in the right
	 *	edge of the packet being outside the window, it will
	 *	be queued for later rather than sent.
	 */

		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		if (copy > len)
			copy = len;

		/*
		 *	We should really check the window here also.
		 */

		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB))
		{
			/*
			 *	We will release the socket incase we sleep here.
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb;
		}
		else
		{
			/*
			 *	We will release the socket incase we sleep here.
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep.
		 */

		if (skb == NULL)
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition.
			 */

			tmp = sk->wmem_alloc;
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it.
			 */
			if (tmp <= sk->wmem_alloc &&
			  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0)
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->len = 0;
		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

		buff = skb->data;

		/*
		 *	FIXME: we need to optimize this.
		 *	Perhaps some hints here would be good.
		 */

		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 )
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}
		skb->len += tmp;
		skb->dev = dev;
		buff += tmp;
		skb->h.th =(struct tcphdr *) buff;
		tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
		if (tmp < 0)
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB)
		{
			((struct tcphdr *)buff)->urg = 1;
			((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
		}
		skb->len += tmp;
		memcpy_fromfs(buff+tmp, from, copy);

		from += copy;
		copied += copy;
		len -= copy;
		skb->len += copy;
		skb->free = 0;
		sk->write_seq += copy;

		/* Sub-MSS frame with data already in flight: hold it back
		 * as a partial buffer (Nagle). */
		if (send_tmp != NULL && sk->packets_out)
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

	/*
	 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
	 *	interactive fast network servers. It's meant to be on and
	 *	it really improves the throughput though not the echo time
	 *	on my slow slip link - Alan
	 */

	/*
	 *	Avoid possible race on send_tmp - c/o Johannes Stille
	 */

	if(sk->partial && ((!sk->packets_out)
	/* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
1758 /*1759 * This is just a wrapper. 1760 */1761
1762 staticinttcp_sendto(structsock *sk, unsignedchar *from,
/* */1763 intlen, intnonblock, unsignedflags,
1764 structsockaddr_in *addr, intaddr_len)
1765 {1766 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1767 return -EINVAL;
1768 if (sk->state == TCP_CLOSE)
1769 return -ENOTCONN;
1770 if (addr_len < sizeof(*addr))
1771 return -EINVAL;
1772 if (addr->sin_family && addr->sin_family != AF_INET)
1773 return -EINVAL;
1774 if (addr->sin_port != sk->dummy_th.dest)
1775 return -EISCONN;
1776 if (addr->sin_addr.s_addr != sk->daddr)
1777 return -EISCONN;
1778 returntcp_write(sk, from, len, nonblock, flags);
1779 }1780
1781
/*
 *	Send an ack if one is backlogged at this point. Ought to merge
 *	this with tcp_send_ack().
 *
 *	Builds a bare ack from the socket's template header (no received
 *	segment to echo, unlike tcp_send_ack) and transmits it. Does
 *	nothing when no ack is owed.
 */
static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	if (!sk->ack_backlog)
		return;

	/*
	 *	FIXME: we need to put code here to prevent this routine from
	 *	being called. Being called once in a while is ok, so only check
	 *	if this is the second time in a row.
	 */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */

	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->len = sizeof(struct tcphdr);
	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route: drop the ack, it was best-effort anyway. */
		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}

	buff->len += tmp;
	t1 =(struct tcphdr *)(buff->data +tmp);

	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	/* The backlog is satisfied by this ack. */
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
1856
/*
 *	FIXME:
 *	This routine frees used buffers.
 *	It should consider sending an ACK to let the
 *	other end know we now have a bigger window.
 *
 *	Frees every fully-consumed (used, unreferenced) buffer at the head
 *	of the receive queue, then decides whether the freed space is worth
 *	advertising: either ack immediately or arm a short ack timer.
 */
static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	/* Remember how much receive space we had before freeing. */
	left = sk->prot->rspace(sk);

	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can.
	 */

	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		/* Stop at the first buffer still unread or still in use. */
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */

	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
					    left);
	if ((rspace=sk->prot->rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble. The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window. If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately. Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
		/*
		 * It's unclear whether to use sk->mtu or sk->mss here. They differ only
		 * if the other end is offering a window smaller than the agreed on MSS
		 * (called sk->mtu here). In theory there's no connection between send
		 * and receive, and so no reason to think that they're going to send
		 * small packets. For the moment I'm using the hack of reducing the mss
		 * only on the send side, so I'm putting mtu here.
		 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			/* NOTE(review): del_timer/add_timer operate on
			 * sk->retransmit_timer but the expiry test reads
			 * sk->timer.expires - looks like a field mismatch;
			 * confirm which timer was intended. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);
		}
	}
}
1946
/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 *
 *	Returns 1 with the single urgent byte copied out, 0 at EOF-like
 *	states, -EINVAL when there is no OOB byte to read (or it was
 *	already consumed, or the socket delivers urgent data inline),
 *	-EAGAIN when the urgent byte is announced but not yet valid.
 *	The order of the checks below (err before state before shutdown)
 *	is the caller-visible error precedence - do not reorder.
 */
static int tcp_read_urg(struct sock * sk, int nonblock,
	     unsigned char *to, int len, unsigned flags)
{
	/*
	 *	No URG data to read
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
	{
		int tmp = -sk->err;
		sk->err = 0;
		return tmp;
	}

	if (sk->state == TCP_CLOSE || sk->done)
	{
		/* First read after close reports EOF, later ones error. */
		if (!sk->done) {
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID)
	{
		/* The urgent byte lives in the low bits of urg_data. */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;
		put_fs_byte(c, to);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
2004
/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Walks the receive queue copying in-sequence data to user space,
 *	stepping over the SYN/FIN sequence slots and stopping at urgent
 *	data boundaries. With MSG_PEEK the copied-sequence counter is a
 *	local shadow so nothing is consumed. Returns bytes copied, or a
 *	negative errno when nothing was copied.
 */
static int tcp_read(struct sock *sk, unsigned char *to,
	int len, int nonblock, unsigned flags)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	unsigned long peek_seq;
	volatile unsigned long *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */

	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */

	if (flags & MSG_OOB)
		return tcp_read_urg(sk, nonblock, to, len, flags);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 */

	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		unsigned long offset;

		/*
		 *	Are we at urgent data? Stop if we have read anything.
		 */

		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.
		 */

		/* Set before scanning so a wakeup between the scan and
		 * schedule() is not lost. */
		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* A gap before this buffer: nothing readable yet. */
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			/* The SYN occupies a sequence slot but no data. */
			if (skb->h.th->syn)
				offset--;
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		if (copied)
			break;

		if (sk->err)
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Ack consumed data, then sleep until more arrives. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */

		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */

		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?
		 */

		if (sk->urg_data)
		{
			unsigned long urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					/* Skip the urgent byte unless it is
					 * delivered inline. */
					if (!sk->urginline)
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					/* Read only up to the urgent mark. */
					used = urg_offset;
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */

		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */

		memcpy_tofs(to,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		to += used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */

		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;
		if (used + offset < skb->len)
			continue;

		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		continue;

	found_fin_ok:
		/* The FIN occupies one sequence number. */
		++*seq;
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */

		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2236 /*2237 * State processing on a close. This implements the state shift for2238 * sending our FIN frame. Note that we only send a FIN for some 2239 * states. A shutdown() may have already sent the FIN, or we may be2240 * closed.2241 */2242
2243 staticinttcp_close_state(structsock *sk, intdead)
/* */2244 {2245 intns=TCP_CLOSE;
2246 intsend_fin=0;
2247 switch(sk->state)
2248 {2249 caseTCP_SYN_SENT: /* No SYN back, no FIN needed */2250 break;
2251 caseTCP_SYN_RECV:
2252 caseTCP_ESTABLISHED: /* Closedown begin */2253 ns=TCP_FIN_WAIT1;
2254 send_fin=1;
2255 break;
2256 caseTCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */2257 caseTCP_FIN_WAIT2:
2258 caseTCP_CLOSING:
2259 ns=sk->state;
2260 break;
2261 caseTCP_CLOSE:
2262 caseTCP_LISTEN:
2263 break;
2264 caseTCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and2265 wait only for the ACK */2266 ns=TCP_LAST_ACK;
2267 send_fin=1;
2268 }2269
2270 tcp_set_state(sk,ns);
2271
2272 /*2273 * This is a (useful) BSD violating of the RFC. There is a2274 * problem with TCP as specified in that the other end could2275 * keep a socket open forever with no application left this end.2276 * We use a 3 minute timeout (about the same as BSD) then kill2277 * our end. If they send after that then tough - BUT: long enough2278 * that we won't make the old 4*rto = almost no time - whoops2279 * reset mistake.2280 */2281 if(dead && ns==TCP_FIN_WAIT2)
2282 {2283 inttimer_active=del_timer(&sk->timer);
2284 if(timer_active)
2285 add_timer(&sk->timer);
2286 else2287 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2288 }2289
2290 returnsend_fin;
2291 }2292
/*
 *	Send a fin.
 *
 *	Builds a FIN+ACK segment at sk->write_seq (consuming one sequence
 *	number) and either transmits it immediately or, if the write queue
 *	is non-empty, appends it so it goes out after the queued data.
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */

	buff->sk = sk;
	buff->len = sizeof(*t1);
	buff->localroute = sk->localroute;
	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost.
		 *	(Not good).
		 */

		buff->free = 1;
		prot->wfree(sk,buff->mem_addr, buff->mem_len);
		/* Still burn the sequence number the FIN would have used. */
		sk->write_seq++;
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}

	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */

	t1 =(struct tcphdr *)((char *)t1 +tmp);
	buff->len += tmp;
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	t1->seq = ntohl(sk->write_seq);
	/* The FIN consumes one sequence number. */
	sk->write_seq++;
	buff->h.seq = sk->write_seq;
	t1->ack = 1;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 *	If there is data in the write queue, the fin must be appended to
	 *	the write queue.
	 */

	if (skb_peek(&sk->write_queue) != NULL)
	{
		buff->free = 0;
		if (buff->next != NULL)
		{
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	}
	else
	{
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2396 /*2397 * Shutdown the sending side of a connection. Much like close except2398 * that we don't receive shut down or set sk->dead=1.2399 */2400
2401 voidtcp_shutdown(structsock *sk, inthow)
/* */2402 {2403 /*2404 * We need to grab some memory, and put together a FIN,2405 * and then put it into the queue to be sent.2406 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2407 */2408
2409 if (!(how & SEND_SHUTDOWN))
2410 return;
2411
2412 /*2413 * If we've already sent a FIN, or its a closed state2414 */2415
2416 if (sk->state == TCP_FIN_WAIT1 ||
2417 sk->state == TCP_FIN_WAIT2 ||
2418 sk->state == TCP_CLOSING ||
2419 sk->state == TCP_LAST_ACK ||
2420 sk->state == TCP_TIME_WAIT ||
2421 sk->state == TCP_CLOSE ||
2422 sk->state == TCP_LISTEN2423 )
2424 {2425 return;
2426 }2427 sk->inuse = 1;
2428
2429 /*2430 * flag that the sender has shutdown2431 */2432
2433 sk->shutdown |= SEND_SHUTDOWN;
2434
2435 /*2436 * Clear out any half completed packets. 2437 */2438
2439 if (sk->partial)
2440 tcp_send_partial(sk);
2441
2442 /*2443 * FIN if needed2444 */2445
2446 if(tcp_close_state(sk,0))
2447 tcp_send_fin(sk);
2448
2449 release_sock(sk);
2450 }2451
2452
2453 staticint2454 tcp_recvfrom(structsock *sk, unsignedchar *to,
/* */2455 intto_len, intnonblock, unsignedflags,
2456 structsockaddr_in *addr, int *addr_len)
2457 {2458 intresult;
2459
2460 /* 2461 * Have to check these first unlike the old code. If 2462 * we check them after we lose data on an error2463 * which is wrong 2464 */2465
2466 if(addr_len)
2467 *addr_len = sizeof(*addr);
2468 result=tcp_read(sk, to, to_len, nonblock, flags);
2469
2470 if (result < 0)
2471 return(result);
2472
2473 if(addr)
2474 {2475 addr->sin_family = AF_INET;
2476 addr->sin_port = sk->dummy_th.dest;
2477 addr->sin_addr.s_addr = sk->daddr;
2478 }2479 return(result);
2480 }2481
2482
/*
 *	This routine will send an RST to the other tcp.
 */

static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	int tmp;
	struct device *ndev=NULL;

	/*
	 *	Cannot reset a reset (Think about it).
	 */

	if(th->rst)
		return;

	/*
	 *	We need to grab some memory, and put together an RST,
	 *	and then put it into the queue to be sent.
	 *	GFP_ATOMIC: we may be in interrupt context; silently drop on failure
	 *	(the peer will simply not see a reset).
	 */

	buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
		return;

	buff->len = sizeof(*t1);
	buff->sk = NULL;
	buff->dev = dev;
	buff->localroute = 0;

	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 *	build_header returns the number of bytes it consumed, or <0 on error.
	 */

	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr),tos,ttl);
	if (tmp < 0)
	{
		/* Could not build the lower layers: hand the buffer back. */
		buff->free = 1;
		prot->wfree(NULL, buff->mem_addr, buff->mem_len);
		return;
	}

	/* Step past the link/IP headers to where the TCP header starts. */
	t1 =(struct tcphdr *)((char *)t1 +tmp);
	buff->len += tmp;
	/* Start from a copy of the offending header, then overwrite fields. */
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->rst = 1;
	t1->window = 0;

	if(th->ack)
	{
		/*
		 * Peer sent an ACK: per RFC 793 the RST carries the peer's
		 * ack number as our sequence, with no ACK of our own.
		 * NOTE(review): th->ack_seq is used raw while th->seq gets
		 * htonl() below — this assumes tcp_rcv converted seq (but not
		 * ack_seq) to host order before calling us; confirm there.
		 */
		t1->ack = 0;
		t1->seq = th->ack_seq;
		t1->ack_seq = 0;
	}
	else
	{
		/*
		 * No ACK in the offending segment: send seq 0 and ACK the
		 * segment's sequence (plus one if it was a SYN, which
		 * occupies sequence space).
		 */
		t1->ack = 1;
		if(!th->syn)
			t1->ack_seq=htonl(th->seq);
		else
			t1->ack_seq=htonl(th->seq+1);
		t1->seq=0;
	}

	/* Clear every other flag; this is a pure RST (+ optional ACK). */
	t1->syn = 0;
	t1->urg = 0;
	t1->fin = 0;
	t1->psh = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
	prot->queue_xmit(NULL, ndev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2570
2571 /*2572 * Look for tcp options. Parses everything but only knows about MSS.2573 * This routine is always called with the packet containing the SYN.2574 * However it may also be called with the ack to the SYN. So you2575 * can't assume this is always the SYN. It's always called after2576 * we have set up sk->mtu to our own MTU.2577 *2578 * We need at minimum to add PAWS support here. Possibly large windows2579 * as Linux gets deployed on 100Mb/sec networks.2580 */2581
2582 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2583 {2584 unsignedchar *ptr;
2585 intlength=(th->doff*4)-sizeof(structtcphdr);
2586 intmss_seen = 0;
2587
2588 ptr = (unsignedchar *)(th + 1);
2589
2590 while(length>0)
2591 {2592 intopcode=*ptr++;
2593 intopsize=*ptr++;
2594 switch(opcode)
2595 {2596 caseTCPOPT_EOL:
2597 return;
2598 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */2599 length--;
2600 ptr--; /* the opsize=*ptr++ above was a mistake */2601 continue;
2602
2603 default:
2604 if(opsize<=2) /* Avoid silly options looping forever */2605 return;
2606 switch(opcode)
2607 {2608 caseTCPOPT_MSS:
2609 if(opsize==4 && th->syn)
2610 {2611 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2612 mss_seen = 1;
2613 }2614 break;
2615 /* Add other options here as people feel the urge to implement stuff like large windows */2616 }2617 ptr+=opsize-2;
2618 length-=opsize;
2619 }2620 }2621 if (th->syn)
2622 {2623 if (! mss_seen)
2624 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2625 }2626 #ifdefCONFIG_INET_PCTCP2627 sk->mss = min(sk->max_window >> 1, sk->mtu);
2628 #else2629 sk->mss = min(sk->max_window, sk->mtu);
2630 #endif2631 }2632
/*
 *	Return the classful netmask (network byte order) implied by a
 *	destination address, which is also given in network byte order.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host_addr = ntohl(dst);

	if (IN_CLASSA(host_addr))
		return htonl(IN_CLASSA_NET);
	return IN_CLASSB(host_addr) ? htonl(IN_CLASSB_NET)
				    : htonl(IN_CLASSC_NET);
}
/*
 *	Default sequence number picking algorithm.
 *
 *	NOTE(review): a purely clock-driven ISN (jiffies scaled by SEQ_TICK)
 *	is predictable by an off-path attacker; modern stacks randomize the
 *	initial sequence number (RFC 6528).  Left as-is here.
 */

extern inline long tcp_init_seq(void)
{
	return jiffies * SEQ_TICK - seq_offset;
}
/*
 *	This routine handles a connection request.
 *	It should make sure we haven't already responded.
 *	Because of the way BSD works, we have to send a syn/ack now.
 *	This also means it will be harder to close a socket which is
 *	listening.
 */

static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 unsigned long daddr, unsigned long saddr,
		 struct options *opt, struct device *dev, unsigned long seq)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	unsigned char *ptr;
	struct sock *newsk;
	struct tcphdr *th;
	struct device *ndev=NULL;
	int tmp;
	struct rtable *rt;

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead)
	{
		/* Wake the listener so accept() can run. */
		sk->data_ready(sk,0);
	}
	else
	{
		/* Nobody is listening any more: refuse with an RST. */
		if(sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * Make sure we can accept more.  This will prevent a
	 * flurry of syns from eating up all our memory.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * We need to build a new sock struct.
	 * It is sort of bad to have a socket without an inode attached
	 * to it, but the wake_up's will just wake up the listening socket,
	 * and if the listening socket is destroyed before this is taken
	 * off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* just ignore the syn.  It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/* Clone the listener, then reset every per-connection field. */
	memcpy(newsk, sk, sizeof(*newsk));
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;	/* Slow start: begin with one segment. */
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	/* The peer's SYN occupies one sequence number. */
	newsk->acked_seq = skb->h.th->seq+1;
	newsk->copied_seq = skb->h.th->seq+1;
	newsk->fin_seq = skb->h.th->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger=0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	init_timer(&newsk->retransmit_timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function=&retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/*
	 *	Swap these two, they are from our point of view.
	 */

	newsk->daddr = saddr;
	newsk->saddr = daddr;

	put_sock(newsk->num,newsk);
	newsk->dummy_th.res1 = 0;
	newsk->dummy_th.doff = 6;
	newsk->dummy_th.fin = 0;
	newsk->dummy_th.syn = 0;
	newsk->dummy_th.rst = 0;
	newsk->dummy_th.psh = 0;
	newsk->dummy_th.ack = 0;
	newsk->dummy_th.urg = 0;
	newsk->dummy_th.res2 = 0;
	/* NOTE(review): acked_seq/copied_seq were already set identically above — redundant. */
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->socket = NULL;

	/*
	 *	Grab the ttl and tos values and use them
	 */

	newsk->ip_ttl=sk->ip_ttl;
	newsk->ip_tos=skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for
	 */

	/*
	 *	Note use of sk->user_mss, since user has no direct access to newsk
	 */

	rt=ip_rt_route(saddr, NULL,NULL);

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
		newsk->mtu = rt->rt_mss - HEADER_SIZE;
	else
	{
#ifdef CONFIG_INET_SNARL	/* Sub Nets Are Local */
		if ((saddr ^ daddr) & default_mask(saddr))
#else
		if ((saddr ^ daddr) & dev->pa_mask)
#endif
			/* Off-net destination: be conservative (RFC 1122 default). */
			newsk->mtu = 576 - HEADER_SIZE;
		else
			newsk->mtu = MAX_WINDOW;
	}

	/*
	 *	But not bigger than device MTU
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	This will min with what arrived in the packet
	 */

	tcp_options(newsk,skb->h.th);

	buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* NOTE(review): the error lands on the listening sk, not newsk — confirm intended. */
		sk->err = -ENOMEM;
		newsk->dead = 1;
		release_sock(newsk);
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	/* SYN/ACK carries the 4-byte MSS option after the header. */
	buff->len = sizeof(struct tcphdr)+4;
	buff->sk = newsk;
	buff->localroute = newsk->localroute;

	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
			       IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

	/*
	 *	Something went wrong.
	 */

	if (tmp < 0)
	{
		sk->err = tmp;
		buff->free = 1;
		kfree_skb(buff,FREE_WRITE);
		newsk->dead = 1;
		release_sock(newsk);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->len += tmp;
	t1 =(struct tcphdr *)((char *)t1 +tmp);

	memcpy(t1, skb->h.th, sizeof(*t1));
	buff->h.seq = newsk->write_seq;
	/*
	 *	Swap the send and the receive.
	 */
	t1->dest = skb->h.th->source;
	t1->source = newsk->dummy_th.source;
	t1->seq = ntohl(newsk->write_seq++);
	t1->ack = 1;
	newsk->window = tcp_select_window(newsk);
	newsk->sent_seq = newsk->write_seq;
	t1->window = ntohs(newsk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->ack_seq = ntohl(skb->h.th->seq+1);
	t1->doff = sizeof(*t1)/4+1;	/* +1 word for the MSS option */
	ptr =(unsigned char *)(t1+1);
	/* MSS option: kind 2, length 4, then the 16-bit MSS big-endian. */
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
	ptr[3] =(newsk->mtu) & 0xff;

	tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
	reset_xmit_timer(newsk, TIME_WRITE, newsk->rto);
	/* NOTE(review): the second reset_xmit_timer immediately overrides the
	   first — the first call looks redundant; confirm and remove. */
	reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
	skb->sk = newsk;

	/*
	 *	Charge the sock_buff to newsk.
	 */

	sk->rmem_alloc -= skb->mem_len;
	newsk->rmem_alloc += skb->mem_len;

	skb_queue_tail(&sk->receive_queue,skb);
	sk->ack_backlog++;
	release_sock(newsk);
	tcp_statistics.TcpOutSegs++;
}
2931
/*
 *	Close a TCP socket.  timeout != 0 means hard close (jump straight
 *	to CLOSE); timeout == 0 is the normal path that flushes pending
 *	data and runs the FIN state machine.
 */
static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 * We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 */

	sk->inuse = 1;	/* Lock the socket against the bottom half. */

	if(sk->state == TCP_LISTEN)
	{
		/* Special case: a listener has no peer, so just drop
		   any queued embryonic connections and go to CLOSE. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;	/* No more sends or receives. */

	if (!sk->dead)
		sk->state_change(sk);	/* Wake anyone sleeping on the socket. */

	if (timeout == 0)
	{
		struct sk_buff *skb;

		/*
		 * We need to flush the recv. buffs.  We do this only on the
		 * descriptor close, not protocol-sourced closes, because the
		 * reader process may not have drained the data yet!
		 */

		while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 * Get rid off any half-completed packets.
		 */

		if (sk->partial)
			tcp_send_partial(sk);
	}

	/*
	 * Timeout is not the same thing - however the code likes
	 * to send both the same way (sigh).
	 */

	if(timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* Advance the state machine; 1 means a FIN must go out now. */
		if(tcp_close_state(sk,1)==1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
2996
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue. This happens as incoming acks
 *	open up the remote window for us.
 */

static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->h.seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it. This
			 *	cannot currently occur.
			 */

			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
			/*
			 * put in the ack seq and window at this point rather than earlier,
			 * in order to keep them monotonic.  We really want to avoid taking
			 * back window allocations.  That's legal, but RFC1122 says it's frowned on.
			 * Ack and window will in general have changed since this packet was put
			 * on the write queue.
			 */
			iph = (struct iphdr *)(skb->data +
					       skb->dev->hard_header_len);
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			/* TCP segment length = total length minus headers before th. */
			size = skb->len - (((unsigned char *) th) - skb->data);

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			/* Header fields changed, so the checksum must be redone. */
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */

			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */

			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3088
/*
 *	This routine deals with incoming acks, but not outgoing ones.
 *	Returns 0 only for an ack beyond anything we have sent; 1 otherwise.
 */

extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
	unsigned long ack;
	int flag = 0;

	/*
	 * 1 - there was data in packet as well as ack or new data is sent or
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if(sk->zapped)
		return(1);	/* Dead, cant ack any more so why bother */

	/*
	 *	Have we discovered a larger window
	 */

	ack = ntohl(th->ack_seq);

	if (ntohs(th->window) > sk->max_window)
	{
		sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(sk->max_window>>1, sk->mtu);
#else
		sk->mss = min(sk->max_window, sk->mtu);
#endif
	}

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */

	if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
	{
		if(sk->debug)
			printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);

		/*
		 *	Keepalive processing.
		 */

		if (after(ack, sk->sent_seq))
		{
			return(0);	/* Acks data we never sent: reject. */
		}

		/*
		 *	Restart the keepalive timer.
		 */

		if (sk->keepopen)
		{
			if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
		}
		return(1);
	}

	/*
	 *	If there is data set flag 1
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 *	See if our window has been shrunk.
	 */

	if (after(sk->window_seq, ack+ntohs(th->window)))
	{
		/*
		 * We may need to move packets from the send queue
		 * to the write queue, if the window has been shrunk on us.
		 * The RFC says you are not allowed to shrink your window
		 * like this, but if the other end does, you must be able
		 * to deal with it.
		 */
		struct sk_buff *skb;
		struct sk_buff *skb2;
		struct sk_buff *wskb = NULL;

		skb2 = sk->send_head;
		sk->send_head = NULL;
		sk->send_tail = NULL;

		/*
		 * This is an artifact of a flawed concept. We want one
		 * queue and a smarter send routine when we send all.
		 */

		flag |= 4;	/* Window changed */

		sk->window_seq = ack + ntohs(th->window);
		cli();	/* Walk the retransmit list with interrupts off. */
		while (skb2 != NULL)
		{
			skb = skb2;
			skb2 = skb->link3;
			skb->link3 = NULL;
			if (after(skb->h.seq, sk->window_seq))
			{
				/* Now outside the window: push back to write_queue. */
				if (sk->packets_out > 0)
					sk->packets_out--;
				/* We may need to remove this from the dev send list. */
				if (skb->next != NULL)
				{
					skb_unlink(skb);
				}
				/* Now add it to the write_queue. */
				if (wskb == NULL)
					skb_queue_head(&sk->write_queue,skb);
				else
					skb_append(wskb,skb);
				wskb = skb;
			}
			else
			{
				/* Still in window: rebuild the send list in order. */
				if (sk->send_head == NULL)
				{
					sk->send_head = skb;
					sk->send_tail = skb;
				}
				else
				{
					sk->send_tail->link3 = skb;
					sk->send_tail = skb;
				}
				skb->link3 = NULL;
			}
		}
		sti();
	}

	/*
	 *	Pipe has emptied
	 */

	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out= 0;
	}

	/*
	 *	Update the right hand window edge of the host
	 */

	sk->window_seq = ack + ntohs(th->window);

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a
		 * counter and increment it once every cwnd times.  It's possible
		 * that this should be done only if sk->retransmits == 0.  I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly.  In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

	/*
	 *	Remember the highest ack received.
	 */

	sk->rcv_ack_seq = ack;

	/*
	 *	If this ack opens up a zero window, clear backoff.  It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open ?
		 */

		if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
		    ! before (sk->window_seq, sk->write_queue.next->h.seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt.  this eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < 20)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = 20;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	while(sk->send_head != NULL)
	{
		/* Check for a bug. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as its confirmed to have arrived the other end.
		 */

		if (before(sk->send_head->h.seq, ack+1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 *	We were retransmitting.  don't count this in RTT est
				 */
				flag |= 2;

				/*
				 * even though we've gotten an ack, we're still
				 * retransmitting as long as we're sending from
				 * the retransmit queue.  Keeping retransmits non-zero
				 * prevents us from getting new data interspersed with
				 * retransmissions.
				 */

				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 * Note that we only reset backoff and rto in the
			 * rtt recomputation code.  And that doesn't happen
			 * if there were retransmissions in effect.  So the
			 * first new packet after the retransmissions is
			 * sent with the backoff still in effect.  Not until
			 * we get an ack from a non-retransmitted packet do
			 * we reset the backoff and rto.  This allows us to deal
			 * with a situation where the network delay has increased
			 * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 *	We have one less packet out there.
			 */

			if (sk->packets_out > 0)
				sk->packets_out --;
			/*
			 *	Wake up the process, it can probably write more.
			 */
			if (!sk->dead)
				sk->write_space(sk);
			oskb = sk->send_head;

			if (!(flag&2)) 	/* Not retransmitting */
			{
				long m;

				/*
				 *	The following amusing code comes from Jacobson's
				 *	article in SIGCOMM '88.  Note that rtt and mdev
				 *	are scaled versions of rtt and mean deviation.
				 *	This is designed to be as fast as possible
				 *	m stands for "measurement".
				 */

				m = jiffies - oskb->when;  /* RTT */
				if(m<=0)
					m=1;		/* IS THIS RIGHT FOR <0 ??? */
				m -= (sk->rtt >> 3);    /* m is now error in rtt est */
				sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
				if (m < 0)
					m = -m;		/* m is now abs(error) */
				m -= (sk->mdev >> 2);   /* similar update on mdev */
				sk->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */

				/*
				 *	Now update timeout.  Note that this removes any backoff.
				 */

				sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
				if (sk->rto > 120*HZ)
					sk->rto = 120*HZ;
				if (sk->rto < 20)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
					sk->rto = 20;
				sk->backoff = 0;
			}
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
					   In this case as we just set it up */
			cli();
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 *	We may need to remove this from the dev send list.
			 */

			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 * XXX someone ought to look at this too.. at the moment, if skb_peek()
	 * returns non-NULL, we complete ignore the timer stuff in the else
	 * clause.  We ought to organize the code so that else clause can
	 * (should) be executed regardless, possibly moving the PROBE timer
	 * reset over.  The skb_peek() thing should only move stuff to the
	 * write queue, NOT also manage the timer functions.
	 */

	/*
	 * Maybe we can take some stuff off of the write queue,
	 * and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room.  Start the zero-window probe timer.
			 */
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 * from TCP_CLOSE we don't do anything
		 *
		 * from anything else, if there is write data (or fin) pending,
		 * we use a TIME_WRITE timeout, else if keepalive we reset to
		 * a KEEPALIVE timeout, else we delete the timer.
		 *
		 * We do not set flag for nominal write data, otherwise we may
		 * force a state where we start to write itsy bitsy tidbits
		 * of data.
		 */

		switch(sk->state) {
		case TCP_TIME_WAIT:
			/*
			 * keep us in TIME_WAIT until we stop getting packets,
			 * reset the timeout.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 * don't touch the timer.
			 */
			break;
		default:
			/*
			 * Must check send_head, write_queue, and ack_backlog
			 * to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 *	We have nothing queued but space to send. Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
	 * we are now waiting for an acknowledge to our FIN.  The other end is
	 * already in TIME_WAIT.
	 *
	 * Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if(sk->debug)
			printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			tcp_set_state(sk,TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 * Move to FIN_WAIT2 to await a FIN from the other end. Set
	 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 * Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake
	 */

	if(sk->state==TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		/* Re-parse options: this may be the ACK of our SYN/ACK. */
		tcp_options(sk,th);
		sk->dummy_th.dest=th->source;
		sk->copied_seq = sk->acked_seq;
		if(!sk->dead)
			sk->state_change(sk);
		if(sk->max_window==0)
		{
			sk->max_window=32;	/* Sanity check */
			sk->mss=min(sk->max_window,sk->mtu);
		}
	}

	/*
	 * I make no guarantees about the first clause in the following
	 * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
	 * what conditions "!flag" would be true.  However I think the rest
	 * of the conditions would prevent that from causing any
	 * unnecessary retransmission.
	 *   Clearly if the first packet has expired it should be
	 * retransmitted.  The other alternative, "flag&2 && retransmits", is
	 * harder to explain:  You have to look carefully at how and when the
	 * timer is set and with what timeout.  The most recent transmission always
	 * sets the timer.  So in general if the most recent thing has timed
	 * out, everything before it has as well.  So we want to go ahead and
	 * retransmit some more.  If we didn't explicitly test for this
	 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 * would not be true.  If you look at the pattern of timing, you can
	 * show that rto is increased fast enough that the next packet would
	 * almost never be retransmitted immediately.  Then you'd end up
	 * waiting for a timeout to send each packet on the retransmission
	 * queue.  With my implementation of the Karn sampling algorithm,
	 * the timeout would double each time.  The net result is that it would
	 * take a hideous amount of time to recover from a single dropped packet.
	 * It's possible that there should also be a test for TIME_WRITE, but
	 * I think as long as "send_head != NULL" and "retransmit" is on, we've
	 * got to be in real retransmission mode.
	 *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
	 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 * As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	       (((flag&2) && sk->retransmits) ||
	       (sk->send_head->when + sk->rto < jiffies)))
	{
		if(sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return(1);
}
3679
3680 /*3681 * Process the FIN bit. This now behaves as it is supposed to work3682 * and the FIN takes effect when it is validly part of sequence3683 * space. Not before when we get holes.3684 *3685 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT3686 * (and thence onto LAST-ACK and finally, CLOSE, we never enter3687 * TIME-WAIT)3688 *3689 * If we are in FINWAIT-1, a received FIN indicates simultaneous3690 * close and we go into CLOSING (and later onto TIME-WAIT)3691 *3692 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.3693 *3694 */3695
/*
 *	Process an in-sequence FIN: record where the FIN sits in sequence
 *	space and advance the connection state machine.  By the time we get
 *	here tcp_data() has already handled acking the segment.  Always
 *	returns 0.
 */
static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	/* Sequence number just past this segment (SYN and FIN each count as one). */
	sk->fin_seq = th->seq + skb->len + th->syn + th->fin;

	/* Wake sleepers on the socket and post async notification (SIGIO). */
	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * Move to CLOSE_WAIT, tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			/* FIN combined with RST: shut traffic in both directions. */
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * Received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * Received a retransmission of the FIN,
			 * restart the TIME_WAIT timer.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens, we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */
			if(sk->ip_xmit_timeout != TIME_WRITE)
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * Received a FIN -- send ACK and enter TIME_WAIT.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * Already in CLOSE.
			 */
			break;
		default:
			/* Any other state: fall to LAST_ACK. */
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}
3775
3776
3777 /*3778 * This routine handles the data. If there is room in the buffer,3779 * it will be have already been moved into it. If there is no3780 * room, then we will just have to discard the packet.3781 */3782
3783 extern__inline__inttcp_data(structsk_buff *skb, structsock *sk,
/* */3784 unsignedlongsaddr, unsignedshortlen)
3785 {3786 structsk_buff *skb1, *skb2;
3787 structtcphdr *th;
3788 intdup_dumped=0;
3789 unsignedlongnew_seq;
3790 unsignedlongshut_seq;
3791
3792 th = skb->h.th;
3793 skb->len = len -(th->doff*4);
3794
3795 /*3796 * The bytes in the receive read/assembly queue has increased. Needed for the3797 * low memory discard algorithm 3798 */3799
3800 sk->bytes_rcv += skb->len;
3801
3802 if (skb->len == 0 && !th->fin && !th->urg && !th->psh)
3803 {3804 /* 3805 * Don't want to keep passing ack's back and forth. 3806 * (someone sent us dataless, boring frame)3807 */3808 if (!th->ack)
3809 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3810 kfree_skb(skb, FREE_READ);
3811 return(0);
3812 }3813
3814 /*3815 * We no longer have anyone receiving data on this connection.3816 */3817
3818 #ifndef TCP_DONT_RST_SHUTDOWN
3819
3820 if(sk->shutdown & RCV_SHUTDOWN)
3821 {3822 /*3823 * FIXME: BSD has some magic to avoid sending resets to3824 * broken 4.2 BSD keepalives. Much to my surprise a few non3825 * BSD stacks still have broken keepalives so we want to3826 * cope with it.3827 */3828
3829 if(skb->len) /* We don't care if its just an ack or3830 a keepalive/window probe */3831 {3832 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */3833
3834 /* Do this the way 4.4BSD treats it. Not what I'd3835 regard as the meaning of the spec but its what BSD3836 does and clearly they know everything 8) */3837
3838 /*3839 * This is valid because of two things3840 *3841 * a) The way tcp_data behaves at the bottom.3842 * b) A fin takes effect when read not when received.3843 */3844
3845 shut_seq=sk->acked_seq+1; /* Last byte */3846
3847 if(after(new_seq,shut_seq))
3848 {3849 if(sk->debug)
3850 printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n",
3851 sk, new_seq, shut_seq, sk->blog);
3852 if(sk->dead)
3853 {3854 sk->acked_seq = new_seq + th->fin;
3855 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3856 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3857 tcp_statistics.TcpEstabResets++;
3858 tcp_set_state(sk,TCP_CLOSE);
3859 sk->err = EPIPE;
3860 sk->shutdown = SHUTDOWN_MASK;
3861 kfree_skb(skb, FREE_READ);
3862 return 0;
3863 }3864 }3865 }3866 }3867
3868 #endif3869
3870 /*3871 * Now we have to walk the chain, and figure out where this one3872 * goes into it. This is set up so that the last packet we received3873 * will be the first one we look at, that way if everything comes3874 * in order, there will be no performance loss, and if they come3875 * out of order we will be able to fit things in nicely.3876 *3877 * [AC: This is wrong. We should assume in order first and then walk3878 * forwards from the first hole based upon real traffic patterns.]3879 * 3880 */3881
3882 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */3883 {3884 skb_queue_head(&sk->receive_queue,skb);
3885 skb1= NULL;
3886 }3887 else3888 {3889 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
3890 {3891 if(sk->debug)
3892 {3893 printk("skb1=%p :", skb1);
3894 printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
3895 printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
3896 printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
3897 sk->acked_seq);
3898 }3899
3900 /*3901 * Optimisation: Duplicate frame or extension of previous frame from3902 * same sequence point (lost ack case).3903 * The frame contains duplicate data or replaces a previous frame3904 * discard the previous frame (safe as sk->inuse is set) and put3905 * the new one in its place.3906 */3907
3908 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3909 {3910 skb_append(skb1,skb);
3911 skb_unlink(skb1);
3912 kfree_skb(skb1,FREE_READ);
3913 dup_dumped=1;
3914 skb1=NULL;
3915 break;
3916 }3917
3918 /*3919 * Found where it fits3920 */3921
3922 if (after(th->seq+1, skb1->h.th->seq))
3923 {3924 skb_append(skb1,skb);
3925 break;
3926 }3927
3928 /*3929 * See if we've hit the start. If so insert.3930 */3931 if (skb1 == skb_peek(&sk->receive_queue))
3932 {3933 skb_queue_head(&sk->receive_queue, skb);
3934 break;
3935 }3936 }3937 }3938
3939 /*3940 * Figure out what the ack value for this frame is3941 */3942
3943 th->ack_seq = th->seq + skb->len;
3944 if (th->syn)
3945 th->ack_seq++;
3946 if (th->fin)
3947 th->ack_seq++;
3948
3949 if (before(sk->acked_seq, sk->copied_seq))
3950 {3951 printk("*** tcp.c:tcp_data bug acked < copied\n");
3952 sk->acked_seq = sk->copied_seq;
3953 }3954
3955 /*3956 * Now figure out if we can ack anything. This is very messy because we really want two3957 * receive queues, a completed and an assembly queue. We also want only one transmit3958 * queue.3959 */3960
3961 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
3962 {3963 if (before(th->seq, sk->acked_seq+1))
3964 {3965 intnewwindow;
3966
3967 if (after(th->ack_seq, sk->acked_seq))
3968 {3969 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3970 if (newwindow < 0)
3971 newwindow = 0;
3972 sk->window = newwindow;
3973 sk->acked_seq = th->ack_seq;
3974 }3975 skb->acked = 1;
3976
3977 /*3978 * When we ack the fin, we do the FIN 3979 * processing.3980 */3981
3982 if (skb->h.th->fin)
3983 {3984 tcp_fin(skb,sk,skb->h.th);
3985 }3986
3987 for(skb2 = skb->next;
3988 skb2 != (structsk_buff *)&sk->receive_queue;
3989 skb2 = skb2->next)
3990 {3991 if (before(skb2->h.th->seq, sk->acked_seq+1))
3992 {3993 if (after(skb2->h.th->ack_seq, sk->acked_seq))
3994 {3995 newwindow = sk->window -
3996 (skb2->h.th->ack_seq - sk->acked_seq);
3997 if (newwindow < 0)
3998 newwindow = 0;
3999 sk->window = newwindow;
4000 sk->acked_seq = skb2->h.th->ack_seq;
4001 }4002 skb2->acked = 1;
4003 /*4004 * When we ack the fin, we do4005 * the fin handling.4006 */4007 if (skb2->h.th->fin)
4008 {4009 tcp_fin(skb,sk,skb->h.th);
4010 }4011
4012 /*4013 * Force an immediate ack.4014 */4015
4016 sk->ack_backlog = sk->max_ack_backlog;
4017 }4018 else4019 {4020 break;
4021 }4022 }4023
4024 /*4025 * This also takes care of updating the window.4026 * This if statement needs to be simplified.4027 */4028 if (!sk->delay_acks ||
4029 sk->ack_backlog >= sk->max_ack_backlog ||
4030 sk->bytes_rcv > sk->max_unacked || th->fin) {4031 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4032 }4033 else4034 {4035 sk->ack_backlog++;
4036 if(sk->debug)
4037 printk("Ack queued.\n");
4038 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4039 }4040 }4041 }4042
4043 /*4044 * If we've missed a packet, send an ack.4045 * Also start a timer to send another.4046 */4047
4048 if (!skb->acked)
4049 {4050
4051 /*4052 * This is important. If we don't have much room left,4053 * we need to throw out a few packets so we have a good4054 * window. Note that mtu is used, not mss, because mss is really4055 * for the send side. He could be sending us stuff as large as mtu.4056 */4057
4058 while (sk->prot->rspace(sk) < sk->mtu)
4059 {4060 skb1 = skb_peek(&sk->receive_queue);
4061 if (skb1 == NULL)
4062 {4063 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4064 break;
4065 }4066
4067 /*4068 * Don't throw out something that has been acked. 4069 */4070
4071 if (skb1->acked)
4072 {4073 break;
4074 }4075
4076 skb_unlink(skb1);
4077 kfree_skb(skb1, FREE_READ);
4078 }4079 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4080 sk->ack_backlog++;
4081 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4082 }4083 else4084 {4085 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4086 }4087
4088 /*4089 * Now tell the user we may have some data. 4090 */4091
4092 if (!sk->dead)
4093 {4094 if(sk->debug)
4095 printk("Data wakeup.\n");
4096 sk->data_ready(sk,0);
4097 }4098 return(0);
4099 }4100
4101
4102 /*4103 * This routine is only called when we have urgent data4104 * signalled. Its the 'slow' part of tcp_urg. It could be4105 * moved inline now as tcp_urg is only called from one4106 * place. We handle URGent data wrong. We have to - as4107 * BSD still doesn't use the correction from RFC961.4108 */4109
4110 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4111 {4112 unsignedlongptr = ntohs(th->urg_ptr);
4113
4114 if (ptr)
4115 ptr--;
4116 ptr += th->seq;
4117
4118 /* ignore urgent data that we've already seen and read */4119 if (after(sk->copied_seq, ptr))
4120 return;
4121
4122 /* do we already have a newer (or duplicate) urgent pointer? */4123 if (sk->urg_data && !after(ptr, sk->urg_seq))
4124 return;
4125
4126 /* tell the world about our new urgent pointer */4127 if (sk->proc != 0) {4128 if (sk->proc > 0) {4129 kill_proc(sk->proc, SIGURG, 1);
4130 }else{4131 kill_pg(-sk->proc, SIGURG, 1);
4132 }4133 }4134 sk->urg_data = URG_NOTYET;
4135 sk->urg_seq = ptr;
4136 }4137
4138 /*4139 * This is the 'fast' part of urgent handling.4140 */4141
4142 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4143 unsignedlongsaddr, unsignedlonglen)
4144 {4145 unsignedlongptr;
4146
4147 /*4148 * Check if we get a new urgent pointer - normally not 4149 */4150
4151 if (th->urg)
4152 tcp_check_urg(sk,th);
4153
4154 /*4155 * Do we wait for any urgent data? - normally not4156 */4157
4158 if (sk->urg_data != URG_NOTYET)
4159 return 0;
4160
4161 /*4162 * Is the urgent pointer pointing into this packet? 4163 */4164
4165 ptr = sk->urg_seq - th->seq + th->doff*4;
4166 if (ptr >= len)
4167 return 0;
4168
4169 /*4170 * Ok, got the correct packet, update info 4171 */4172
4173 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4174 if (!sk->dead)
4175 sk->data_ready(sk,0);
4176 return 0;
4177 }4178
4179 /*4180 * This will accept the next outstanding connection. 4181 */4182
/*
 *	Accept the next outstanding connection on a listening socket.
 *	Blocks until one is available unless O_NONBLOCK is set.  Returns the
 *	new connection's sock, or NULL with sk->err set (EINVAL if not
 *	listening, EAGAIN if non-blocking and none pending, ERESTARTSYS on
 *	signal).
 */
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 * We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */
	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race: interrupts off while we claim the socket. */
	cli();
	sk->inuse = 1;

	/* Wait until an established connection appears on the queue. */
	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Release the socket across the sleep, reclaim it afterwards. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		/* A pending signal aborts the accept. */
		if (current->signal & ~current->blocked)
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
	}
	sti();

	/*
	 * Now all we need to do is return skb->sk; the queued buffer
	 * itself is no longer needed.
	 */
	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4237
4238 /*4239 * This will initiate an outgoing connection. 4240 */4241
/*
 *	Initiate an outgoing connection: validate the address, pick initial
 *	sequence numbers, build and transmit the SYN (with an MSS option),
 *	and move the socket to SYN_SENT with the retransmit timer armed.
 *	Returns 0 or a negative errno.
 */
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
	{
		return(-EISCONN);
	}

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 * connect() to INADDR_ANY means loopback (BSD'ism).
	 */
	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 * Don't want a TCP connection going to a broadcast address.
	 */
	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	/* Initialise the connection's sequence state from the clock. */
	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = jiffies * SEQ_TICK - seq_offset;
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	release_sock(sk);

	/* Allocate the SYN frame (may sleep: GFP_KERNEL, socket released). */
	buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->len = 24;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;

	t1 = (struct tcphdr *) buff->data;

	/*
	 * Put in the IP header and routing stuff.
	 */
	rt=ip_rt_route(sk->daddr, NULL, NULL);

	/*
	 * We need to build the routing stuff from the things saved in skb.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	/* Step past the IP header that build_header prepended. */
	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 +tmp);

	/* Fill in the TCP header: a SYN with our initial sequence number. */
	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	t1->seq = ntohl(sk->write_seq++);
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;	/* 24 bytes: header plus the 4-byte MSS option */

	/* Window clamp from the route, if it specifies one. */
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/* MSS: use what the user asked for, else the route, else guess. */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
		sk->mtu = rt->rt_mss;
	else
	{
#ifdef CONFIG_INET_SNARL
		/* Off-subnet destination: be conservative (576). */
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - HEADER_SIZE;
		else
			sk->mtu = MAX_WINDOW;
	}

	/*
	 * But not bigger than device MTU.
	 */
	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	/* NOTE(review): relies on build_header having set dev non-NULL on
	   success - confirm that contract holds for all protocols here. */
	sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 * Put in the TCP options to say MSS.
	 */
	ptr = (unsigned char *)(t1+1);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 * This must go first otherwise a really quick response will get reset.
	 */
	tcp_set_state(sk,TCP_SYN_SENT);
	sk->rto = TCP_TIMEOUT_INIT;
	init_timer(&sk->retransmit_timer);
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	/* Timer for repeating the SYN until an answer. */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	sk->retransmits = TCP_SYN_RETRIES;

	sk->prot->queue_xmit(sk, dev, buff, 0);
	/* NOTE(review): this second reset_xmit_timer repeats the one above
	   with identical arguments - appears redundant. */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4400
4401 /* This functions checks to see if the tcp header is actually acceptable. */4402 extern__inline__inttcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4403 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4404 {4405 unsignedlongnext_seq;
4406
4407 next_seq = len - 4*th->doff;
4408 if (th->fin)
4409 next_seq++;
4410 /* if we have a zero window, we can't have any data in the packet.. */4411 if (next_seq && !sk->window)
4412 gotoignore_it;
4413 next_seq += th->seq;
4414
4415 /*4416 * This isn't quite right. sk->acked_seq could be more recent4417 * than sk->window. This is however close enough. We will accept4418 * slightly more packets than we should, but it should not cause4419 * problems unless someone is trying to forge packets.4420 */4421
4422 /* have we already seen all of this packet? */4423 if (!after(next_seq+1, sk->acked_seq))
4424 gotoignore_it;
4425 /* or does it start beyond the window? */4426 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4427 gotoignore_it;
4428
4429 /* ok, at least part of this packet would seem interesting.. */4430 return 1;
4431
4432 ignore_it:
4433 if (th->rst)
4434 return 0;
4435
4436 /*4437 * Send a reset if we get something not ours and we are4438 * unsynchronized. Note: We don't do anything to our end. We4439 * are just killing the bogus remote connection then we will4440 * connect again and it will work (with luck).4441 */4442
4443 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4444 {4445 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4446 return 1;
4447 }4448
4449 /* Try to resync things. */4450 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4451 return 0;
4452 }4453
4454 /*4455 * When we get a reset we do this.4456 */4457
/*
 *	Standard handling for a received RST: mark the socket zapped, pick
 *	the errno the user should see, move to CLOSE (unless RFC1337
 *	TIME_WAIT protection applies), wake the owner and drop the buffer.
 *	Always returns 0; releases the socket.
 */
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Translate the current state into the error the user sees. */
	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 * Time wait assassination protection [RFC1337].
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
4486 /*4487 * A TCP packet has arrived.4488 */4489
/*
 *	A TCP packet has arrived.  Main receive demultiplexer: checksum,
 *	socket lookup, state-dependent handling (LISTEN, SYN_SENT, BSD
 *	TIME_WAIT reuse), then the normal RFC793 flow: sequence check,
 *	RST, SYN, ACK, urgent data and finally tcp_data().  The redo flag
 *	means the frame comes back off the socket backlog and has already
 *	been checksummed and byte-swapped.  Always returns 0.
 */
int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	unsigned long daddr, unsigned short len,
	unsigned long saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	if (!skb)
	{
		printk("IMPOSSIBLE 1\n");
		return(0);
	}

	if (!dev)
	{
		printk("IMPOSSIBLE 2\n");
		return(0);
	}

	tcp_statistics.TcpInSegs++;

	/* Frames not addressed to this host are discarded. */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 * Find the socket.
	 */
	sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);

	/*
	 * If this socket has got a reset it's to all intents and purposes
	 * really dead. Count closed sockets as dead.
	 *
	 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 * simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 * exist so should cause resets as if the port was unreachable.
	 */
	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo)
	{
		/* First pass: verify checksum before doing anything else. */
		if (tcp_check(th, len, saddr, daddr ))
		{
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 * We don't release the socket because it was
			 * never marked in use.
			 */
			return(0);
		}
		th->seq = ntohl(th->seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 * No such TCB. If th->rst is 0 send a reset
			 * (checked in tcp_reset).
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			/*
			 * Discard frame.
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->len = len;
		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here. */
		cli();
		if (sk->inuse)
		{
			/* Socket busy: queue for later processing via redo. */
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Backlog replay path: the socket may have gone away meanwhile. */
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)
	{
		/* NOTE(review): this path returns without kfree_skb() or
		   release_sock() - it leaks the buffer and leaves the socket
		   marked in use.  Should be unreachable, but worth confirming. */
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 * Charge the memory to the socket; drop the frame if over budget.
	 */
	if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

	skb->sk=sk;
	sk->rmem_alloc += skb->mem_len;

	/*
	 * This basically follows the flow suggested by RFC793, with the
	 * corrections in RFC1122. We don't implement precedence and we
	 * process URG incorrectly (deliberately so) for BSD bug
	 * compatibility. We also set up variables more thoroughly [Karn
	 * notes in the KA9Q code the RFC793 incoming segment rules don't
	 * initialise the variables for all paths].
	 */
	if(sk->state!=TCP_ESTABLISHED)	/* Skip this lot for normal flow */
	{
		/*
		 * Now deal with unusual cases.
		 */
		if(sk->state==TCP_LISTEN)
		{
			/* These use the socket TOS..
			   might want to be the received TOS */
			if(th->ack)
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 * We don't care for RST, and non SYN are absorbed
			 * (old segments).  Broadcast/multicast SYN isn't
			 * allowed. Note - bug if you change the netmask on
			 * a running connection it can go broadcast. Even
			 * Sun's have this problem so I'm ignoring it.
			 */
			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 * Guess we need to make a new socket up.
			 */
			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 * Now we have several options: In theory there is
			 * nothing else in the frame. KA9Q has an option to
			 * send data with the syn, BSD accepts data with the
			 * syn up to the [to be] advertised window and
			 * Solaris 2.1 gives you a protocol error. For now
			 * we just ignore it, that fits the spec precisely
			 * and avoids incompatibilities. It would be nice in
			 * future to drop through and process the data.
			 */
			release_sock(sk);
			return 0;
		}

		/* Retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 * SYN sent means we have to look for a suitable ack and
		 * either reset for bad matches or go to connected.
		 */
		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack. */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/*
					 * Reset the ack - it's an ack from a
					 * different connection  [th->rst is
					 * checked in tcp_reset()]
					 */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/*
					 * A valid ack from a different
					 * connection start. Shouldn't happen
					 * but cover it.
					 */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}

				/*
				 * Ok.. it's good. Set up sequence numbers
				 * and move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq=th->seq+1;
				sk->fin_seq=th->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				/* Peer advertised no window yet: pick sane defaults. */
				if(sk->max_window==0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring. */
				if(th->syn && !th->rst)
				{
					/*
					 * Crossed SYN's are fine - but
					 * talking to yourself is right out...
					 */
					if(sk->saddr==saddr && sk->daddr==daddr &&
					   sk->dummy_th.source==th->source &&
					   sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 * FIXME:
					 * Must send SYN|ACK here
					 */
				}
				/* Discard junk segment. */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 * SYN_RECV with data maybe.. drop through.
			 */
			goto rfc_step6;
		}

	/*
	 * BSD has a funny hack with TIME_WAIT and fast reuse of a port.
	 * There is a more complex suggestion for fixing these reuse issues
	 * in RFC1644 but not yet ready for general use. Also see RFC1379.
	 */
#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(th->seq, sk->acked_seq) && !th->rst)
		{
			/* Kill the old TIME_WAIT socket and hand the SYN to the
			   listener on the same port, with a bumped sequence. */
			long seq=sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			sk->rmem_alloc -= skb->mem_len;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->mem_len;
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 * We are now in normal data flow (see the step list in the RFC).
	 * Note most of these are inline now. I'll inline the lot when
	 * I have time to test it hard and look at what gcc outputs.
	 */
	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 * !syn_ok is effectively the state test in RFC793.
	 */
	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}

	/*
	 * Process the ACK.
	 */
	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 * Our three way handshake failed.
		 */
		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 * Process urgent data.
	 */
	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 * Process the encapsulated data.
	 */
	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 * And done.
	 */
	release_sock(sk);
	return 0;
}
4881 /*4882 * This routine sends a packet with an out of date sequence4883 * number. It assumes the other end will try to ack it.4884 */4885
4886 staticvoidtcp_write_wakeup(structsock *sk)
/* */4887 {4888 structsk_buff *buff;
4889 structtcphdr *t1;
4890 structdevice *dev=NULL;
4891 inttmp;
4892
4893 if (sk->zapped)
4894 return; /* After a valid reset we can send no more */4895
4896 /*4897 * Write data can still be transmitted/retransmitted in the4898 * following states. If any other state is encountered, return.4899 * [listen/close will never occur here anyway]4900 */4901
4902 if (sk->state != TCP_ESTABLISHED &&
4903 sk->state != TCP_CLOSE_WAIT &&
4904 sk->state != TCP_FIN_WAIT1 &&
4905 sk->state != TCP_LAST_ACK &&
4906 sk->state != TCP_CLOSING4907 )
4908 {4909 return;
4910 }4911
4912 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
4913 if (buff == NULL)
4914 return;
4915
4916 buff->len = sizeof(structtcphdr);
4917 buff->free = 1;
4918 buff->sk = sk;
4919 buff->localroute = sk->localroute;
4920
4921 t1 = (structtcphdr *) buff->data;
4922
4923 /* Put in the IP header and routing stuff. */4924 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4925 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
4926 if (tmp < 0)
4927 {4928 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4929 return;
4930 }4931
4932 buff->len += tmp;
4933 t1 = (structtcphdr *)((char *)t1 +tmp);
4934
4935 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
4936
4937 /*4938 * Use a previous sequence.4939 * This should cause the other end to send an ack.4940 */4941
4942 t1->seq = htonl(sk->sent_seq-1);
4943 t1->ack = 1;
4944 t1->res1= 0;
4945 t1->res2= 0;
4946 t1->rst = 0;
4947 t1->urg = 0;
4948 t1->psh = 0;
4949 t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */4950 t1->syn = 0;
4951 t1->ack_seq = ntohl(sk->acked_seq);
4952 t1->window = ntohs(tcp_select_window(sk));
4953 t1->doff = sizeof(*t1)/4;
4954 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
4955 /*4956 * Send it and free it.4957 * This will prevent the timer from automatically being restarted.4958 */4959 sk->prot->queue_xmit(sk, dev, buff, 1);
4960 tcp_statistics.TcpOutSegs++;
4961 }4962
4963 /*4964 * A window probe timeout has occurred.4965 */4966
4967 voidtcp_send_probe0(structsock *sk)
/* */4968 {4969 if (sk->zapped)
4970 return; /* After a valid reset we can send no more */4971
4972 tcp_write_wakeup(sk);
4973
4974 sk->backoff++;
4975 sk->rto = min(sk->rto << 1, 120*HZ);
4976 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
4977 sk->retransmits++;
4978 sk->prot->retransmits ++;
4979 }4980
4981 /*4982 * Socket option code for TCP. 4983 */4984
4985 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */4986 {4987 intval,err;
4988
4989 if(level!=SOL_TCP)
4990 returnip_setsockopt(sk,level,optname,optval,optlen);
4991
4992 if (optval == NULL)
4993 return(-EINVAL);
4994
4995 err=verify_area(VERIFY_READ, optval, sizeof(int));
4996 if(err)
4997 returnerr;
4998
4999 val = get_fs_long((unsignedlong *)optval);
5000
5001 switch(optname)
5002 {5003 caseTCP_MAXSEG:
5004 /*5005 * values greater than interface MTU won't take effect. however at5006 * the point when this call is done we typically don't yet know5007 * which interface is going to be used5008 */5009 if(val<1||val>MAX_WINDOW)
5010 return -EINVAL;
5011 sk->user_mss=val;
5012 return 0;
5013 caseTCP_NODELAY:
5014 sk->nonagle=(val==0)?0:1;
5015 return 0;
5016 default:
5017 return(-ENOPROTOOPT);
5018 }5019 }5020
5021 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5022 {5023 intval,err;
5024
5025 if(level!=SOL_TCP)
5026 returnip_getsockopt(sk,level,optname,optval,optlen);
5027
5028 switch(optname)
5029 {5030 caseTCP_MAXSEG:
5031 val=sk->user_mss;
5032 break;
5033 caseTCP_NODELAY:
5034 val=sk->nonagle;
5035 break;
5036 default:
5037 return(-ENOPROTOOPT);
5038 }5039 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5040 if(err)
5041 returnerr;
5042 put_fs_long(sizeof(int),(unsignedlong *) optlen);
5043
5044 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5045 if(err)
5046 returnerr;
5047 put_fs_long(val,(unsignedlong *)optval);
5048
5049 return(0);
5050 }5051
5052
5053 structprototcp_prot = {5054 sock_wmalloc,
5055 sock_rmalloc,
5056 sock_wfree,
5057 sock_rfree,
5058 sock_rspace,
5059 sock_wspace,
5060 tcp_close,
5061 tcp_read,
5062 tcp_write,
5063 tcp_sendto,
5064 tcp_recvfrom,
5065 ip_build_header,
5066 tcp_connect,
5067 tcp_accept,
5068 ip_queue_xmit,
5069 tcp_retransmit,
5070 tcp_write_wakeup,
5071 tcp_read_wakeup,
5072 tcp_rcv,
5073 tcp_select,
5074 tcp_ioctl,
5075 NULL,
5076 tcp_shutdown,
5077 tcp_setsockopt,
5078 tcp_getsockopt,
5079 128,
5080 0,
5081 {NULL,},
5082 "TCP",
5083 0, 0
5084 };