1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@no.unit.nvg> 20 * 21 * Fixes: 22 * Alan Cox : Numerous verify_area() calls 23 * Alan Cox : Set the ACK bit on a reset 24 * Alan Cox : Stopped it crashing if it closed while sk->inuse=1 25 * and was trying to connect (tcp_err()). 26 * Alan Cox : All icmp error handling was broken 27 * pointers passed where wrong and the 28 * socket was looked up backwards. Nobody 29 * tested any icmp error code obviously. 30 * Alan Cox : tcp_err() now handled properly. It wakes people 31 * on errors. select behaves and the icmp error race 32 * has gone by moving it into sock.c 33 * Alan Cox : tcp_reset() fixed to work for everything not just 34 * packets for unknown sockets. 35 * Alan Cox : tcp option processing. 36 * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong] 37 * Herp Rosmanith : More reset fixes 38 * Alan Cox : No longer acks invalid rst frames. Acking 39 * any kind of RST is right out. 40 * Alan Cox : Sets an ignore me flag on an rst receive 41 * otherwise odd bits of prattle escape still 42 * Alan Cox : Fixed another acking RST frame bug. Should stop 43 * LAN workplace lockups. 
44 * Alan Cox : Some tidyups using the new skb list facilities 45 * Alan Cox : sk->keepopen now seems to work 46 * Alan Cox : Pulls options out correctly on accepts 47 * Alan Cox : Fixed assorted sk->rqueue->next errors 48 * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops. 49 * Alan Cox : Tidied tcp_data to avoid a potential nasty. 50 * Alan Cox : Added some better commenting, as the tcp is hard to follow 51 * Alan Cox : Removed incorrect check for 20 * psh 52 * Michael O'Reilly : ack < copied bug fix. 53 * Johannes Stille : Misc tcp fixes (not all in yet). 54 * Alan Cox : FIN with no memory -> CRASH 55 * Alan Cox : Added socket option proto entries. Also added awareness of them to accept. 56 * Alan Cox : Added TCP options (SOL_TCP) 57 * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets. 58 * Alan Cox : Use ip_tos/ip_ttl settings. 59 * Alan Cox : Handle FIN (more) properly (we hope). 60 * Alan Cox : RST frames sent on unsynchronised state ack error/ 61 * Alan Cox : Put in missing check for SYN bit. 62 * Alan Cox : Added tcp_select_window() aka NET2E 63 * window non shrink trick. 64 * Alan Cox : Added a couple of small NET2E timer fixes 65 * Charles Hedrick : TCP fixes 66 * Toomas Tamm : TCP window fixes 67 * Alan Cox : Small URG fix to rlogin ^C ack fight 68 * Charles Hedrick : Rewrote most of it to actually work 69 * Linus : Rewrote tcp_read() and URG handling 70 * completely 71 * Gerhard Koerting: Fixed some missing timer handling 72 * Matthew Dillon : Reworked TCP machine states as per RFC 73 * Gerhard Koerting: PC/TCP workarounds 74 * Adam Caldwell : Assorted timer/timing errors 75 * Matthew Dillon : Fixed another RST bug 76 * Alan Cox : Move to kernel side addressing changes. 77 * Alan Cox : Beginning work on TCP fastpathing (not yet usable) 78 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
79 * Alan Cox : TCP fast path debugging 80 * Alan Cox : Window clamping 81 * Michael Riepe : Bug in tcp_check() 82 * Matt Dillon : More TCP improvements and RST bug fixes 83 * Matt Dillon : Yet more small nasties remove from the TCP code 84 * (Be very nice to this man if tcp finally works 100%) 8) 85 * Alan Cox : BSD accept semantics. 86 * Alan Cox : Reset on closedown bug. 87 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 88 * Michael Pall : Handle select() after URG properly in all cases. 89 * Michael Pall : Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin). 90 * Michael Pall : Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now. 91 * Michael Pall : recv(...,MSG_OOB) never blocks in the BSD api. 92 * Alan Cox : Changed the semantics of sk->socket to 93 * fix a race and a signal problem with 94 * accept() and async I/O. 95 * Alan Cox : Relaxed the rules on tcp_sendto(). 96 * Yury Shevchuk : Really fixed accept() blocking problem. 97 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 98 * clients/servers which listen in on 99 * fixed ports. 100 * Alan Cox : Cleaned the above up and shrank it to 101 * a sensible code size. 102 * Alan Cox : Self connect lockup fix. 103 * Alan Cox : No connect to multicast. 104 * Ross Biro : Close unaccepted children on master 105 * socket close. 106 * Alan Cox : Reset tracing code. 107 * Alan Cox : Spurious resets on shutdown. 108 * Alan Cox : Giant 15 minute/60 second timer error 109 * Alan Cox : Small whoops in selecting before an accept. 110 * Alan Cox : Kept the state trace facility since its 111 * handy for debugging. 112 * Alan Cox : More reset handler fixes. 113 * Alan Cox : Started rewriting the code based on the RFC's 114 * for other useful protocol references see: 115 * Comer, KA9Q NOS, and for a reference on the 116 * difference between specifications and how BSD 117 * works see the 4.4lite source. 
118 * A.N.Kuznetsov : Don't time wait on completion of tidy 119 * close. 120 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 121 * Linus Torvalds : Fixed BSD port reuse to work first syn 122 * Alan Cox : Reimplemented timers as per the RFC and using multiple 123 * timers for sanity. 124 * Alan Cox : Small bug fixes, and a lot of new 125 * comments. 126 * Alan Cox : Fixed dual reader crash by locking 127 * the buffers (much like datagram.c) 128 * Alan Cox : Fixed stuck sockets in probe. A probe 129 * now gets fed up of retrying without 130 * (even a no space) answer. 131 * Alan Cox : Extracted closing code better 132 * Alan Cox : Fixed the closing state machine to 133 * resemble the RFC. 134 * Alan Cox : More 'per spec' fixes. 135 * 136 * 137 * To Fix: 138 * Fast path the code. Two things here - fix the window calculation 139 * so it doesn't iterate over the queue, also spot packets with no funny 140 * options arriving in order and process directly. 141 * 142 * Implement RFC 1191 [Path MTU discovery] 143 * Look at the effect of implementing RFC 1337 suggestions and their impact. 144 * Rewrite output state machine to use a single queue and do low window 145 * situations as per the spec (RFC 1122) 146 * Speed up input assembly algorithm. 147 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 148 * could do with it working on IPv4 149 * User settable/learned rtt/max window/mtu 150 * Cope with MTU/device switches when retransmitting in tcp. 151 * Fix the window handling to use PR's new code. 152 * 153 * Change the fundamental structure to a single send queue maintained 154 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 155 * active routes too]). Cut the queue off in tcp_retransmit/ 156 * tcp_transmit. 157 * Change the receive queue to assemble as it goes. This lets us 158 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 159 * tcp_data/tcp_read as well as the window shrink crud. 
160 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 161 * tcp_queue_skb seem obvious routines to extract. 162 * 163 * This program is free software; you can redistribute it and/or 164 * modify it under the terms of the GNU General Public License 165 * as published by the Free Software Foundation; either version 166 * 2 of the License, or(at your option) any later version. 167 * 168 * Description of States: 169 * 170 * TCP_SYN_SENT sent a connection request, waiting for ack 171 * 172 * TCP_SYN_RECV received a connection request, sent ack, 173 * waiting for final ack in three-way handshake. 174 * 175 * TCP_ESTABLISHED connection established 176 * 177 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 178 * transmission of remaining buffered data 179 * 180 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 181 * to shutdown 182 * 183 * TCP_CLOSING both sides have shutdown but we still have 184 * data we have to finish sending 185 * 186 * TCP_TIME_WAIT timeout to catch resent junk before entering 187 * closed, can only be entered from FIN_WAIT2 188 * or CLOSING. Required because the other end 189 * may not have gotten our last ACK causing it 190 * to retransmit the data packet (which we ignore) 191 * 192 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 193 * us to finish writing our data and to shutdown 194 * (we have to close() to move on to LAST_ACK) 195 * 196 * TCP_LAST_ACK out side has shutdown after remote has 197 * shutdown. There may still be data in our 198 * buffer that we have to finish sending 199 * 200 * TCP_CLOSE socket is finished 201 */ 202
203 #include <linux/types.h>
204 #include <linux/sched.h>
205 #include <linux/mm.h>
206 #include <linux/time.h>
207 #include <linux/string.h>
208 #include <linux/config.h>
209 #include <linux/socket.h>
210 #include <linux/sockios.h>
211 #include <linux/termios.h>
212 #include <linux/in.h>
213 #include <linux/fcntl.h>
214 #include <linux/inet.h>
215 #include <linux/netdevice.h>
216 #include "snmp.h"
217 #include "ip.h"
218 #include "protocol.h"
219 #include "icmp.h"
220 #include "tcp.h"
221 #include "arp.h"
222 #include <linux/skbuff.h>
223 #include "sock.h"
224 #include "route.h"
225 #include <linux/errno.h>
226 #include <linux/timer.h>
227 #include <asm/system.h>
228 #include <asm/segment.h>
229 #include <linux/mm.h>
230
/*
 * The MSL timer is the 'normal' timer.
 */

#define reset_msl_timer(x,y,z)	reset_timer(x,y,z)

/* Granularity constant for the initial-sequence-number clock. */
#define SEQ_TICK 3

unsigned long seq_offset;		/* offset folded into the ISN clock */
struct tcp_mib	tcp_statistics;		/* SNMP counters for TCP (see snmp.h) */

/* Forward declaration: tcp_close_pending() below calls tcp_close(). */
static void tcp_close(struct sock *sk, int timeout);


/*
 * The less said about this the better, but it works and will do for 1.2
 *
 * Wait queue woken when a pending connection on a listening socket
 * reaches ESTABLISHED, so select() on the master socket can return.
 */

static struct wait_queue *master_select_wakeup;
/*
 * Return the smaller of two unsigned quantities.
 */
static __inline__ int min(unsigned int a, unsigned int b)
{
	return (a < b) ? a : b;
}
#undef STATE_TRACE

#ifdef STATE_TRACE
/*
 * Human-readable names for the TCP_* state values, indexed by state
 * number.  Used only by the transition trace in tcp_set_state().
 */
static char *statename[]={
	"Unused","Established","Syn Sent","Syn Recv",
	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
	"Close Wait","Last ACK","Listen","Closing"
};
#endif
/*
 * Change a socket's TCP state.  Keeps the SNMP gauge of currently
 * established connections (TcpCurrEstab) in step, and wakes any
 * select() sleeping on a listening master socket when a pending
 * connection completes its three-way handshake.
 */
static __inline__ void tcp_set_state(struct sock *sk, int state)
{
	/* Leaving ESTABLISHED: one fewer current connection. */
	if(sk->state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab--;
#ifdef STATE_TRACE
	if(sk->debug)
		printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
#endif
	/* This is a hack but it doesn't occur often and it is going to
	   be a real pain to fix nicely. */

	/* SYN_RECV -> ESTABLISHED: a connection on the listen queue just
	   became acceptable, so wake select() on the master socket. */
	if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
	{
		wake_up_interruptible(&master_select_wakeup);
	}
	sk->state=state;
	/* Entering ESTABLISHED: one more current connection. */
	if(state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab++;
}
287 /* 288 * This routine picks a TCP windows for a socket based on 289 * the following constraints 290 * 291 * 1. The window can never be shrunk once it is offered (RFC 793) 292 * 2. We limit memory per socket 293 * 294 * For now we use NET2E3's heuristic of offering half the memory 295 * we have handy. All is not as bad as this seems however because 296 * of two things. Firstly we will bin packets even within the window 297 * in order to get the data we are waiting for into the memory limit. 298 * Secondly we bin common duplicate forms at receive time 299 * Better heuristics welcome 300 */ 301
302 inttcp_select_window(structsock *sk)
/* */ 303 { 304 intnew_window = sk->prot->rspace(sk);
305
306 if(sk->window_clamp)
307 new_window=min(sk->window_clamp,new_window);
308 /* 309 * Two things are going on here. First, we don't ever offer a 310 * window less than min(sk->mss, MAX_WINDOW/2). This is the 311 * receiver side of SWS as specified in RFC1122. 312 * Second, we always give them at least the window they 313 * had before, in order to avoid retracting window. This 314 * is technically allowed, but RFC1122 advises against it and 315 * in practice it causes trouble. 316 * 317 * Fixme: This doesn't correctly handle the case where 318 * new_window > sk->window but not by enough to allow for the 319 * shift in sequence space. 320 */ 321 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
322 return(sk->window);
323 return(new_window);
324 } 325
326 /* 327 * Find someone to 'accept'. Must be called with 328 * sk->inuse=1 or cli() 329 */ 330
331 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 332 { 333 structsk_buff *p=skb_peek(&s->receive_queue);
334 if(p==NULL)
335 returnNULL;
336 do 337 { 338 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
339 returnp;
340 p=p->next;
341 } 342 while(p!=(structsk_buff *)&s->receive_queue);
343 returnNULL;
344 } 345
346 /* 347 * Remove a completed connection and return it. This is used by 348 * tcp_accept() to get connections from the queue. 349 */ 350
351 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 352 { 353 structsk_buff *skb;
354 unsignedlongflags;
355 save_flags(flags);
356 cli();
357 skb=tcp_find_established(s);
358 if(skb!=NULL)
359 skb_unlink(skb); /* Take it off the queue */ 360 restore_flags(flags);
361 returnskb;
362 } 363
364 /* 365 * This routine closes sockets which have been at least partially 366 * opened, but not yet accepted. Currently it is only called by 367 * tcp_close, and timeout mirrors the value there. 368 */ 369
370 staticvoidtcp_close_pending (structsock *sk)
/* */ 371 { 372 structsk_buff *skb;
373
374 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
375 { 376 skb->sk->dead=1;
377 tcp_close(skb->sk, 0);
378 kfree_skb(skb, FREE_READ);
379 } 380 return;
381 } 382
/*
 * Enter the time wait state: set TIME_WAIT, mark both directions
 * shut down, notify any sleeper, and arm the MSL timer that will
 * eventually finish the close.
 */
static void tcp_time_wait(struct sock *sk)
{
	tcp_set_state(sk,TCP_TIME_WAIT);
	sk->shutdown = SHUTDOWN_MASK;		/* no more sending or receiving */
	if (!sk->dead)
		sk->state_change(sk);		/* wake anyone waiting on the state */
	reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}
/*
 * A socket has timed out on its send queue and wants to do a
 * little retransmitting. Currently this means TCP.
 *
 * Walks the send queue from sk->send_head, refreshing each frame's
 * ack/window fields and IP id, and re-queues it to the device.  When
 * 'all' is zero only the first frame is retransmitted; otherwise up
 * to a congestion window's worth of frames are sent.
 */
void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;
	int ct=0;		/* frames retransmitted in this call */

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;	/* restart the round trip clock for this frame */

		/*
		 * In general it's OK just to use the old packet.  However we
		 * need to use the current ack and window fields.  Urg and
		 * urg_ptr could possibly stand to be updated as well, but we
		 * don't keep the necessary data.  That shouldn't be a problem,
		 * if the other end is doing the right thing.  Since we're
		 * changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)(skb->data + dev->hard_header_len);
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = skb->len - (((unsigned char *) th) - skb->data);

		/*
		 * Note: We ought to check for window limits here but
		 * currently this is done (less efficiently) elsewhere.
		 * We do need to check for a route change but can't handle
		 * that until we have the new 1.3.x buffers in.
		 */

		iph->id = htons(ip_id_count++);
		ip_send_check(iph);	/* IP header changed - recompute its checksum */

		/*
		 * This is not the right way to handle this. We have to
		 * issue an up to date window and ack report with this
		 * retransmit to keep the odd buggy tcp that relies on
		 * the fact BSD does this happy.
		 * We don't however need to recalculate the entire
		 * checksum, so someone wanting a small problem to play
		 * with might like to implement RFC1141/RFC1624 and speed
		 * this up by avoiding a full checksum.
		 */

		th->ack_seq = ntohl(sk->acked_seq);	/* ntohl performs the same swap as htonl here */
		th->window = ntohs(tcp_select_window(sk));
		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		/*
		 * If the interface is (still) up and running, kick it.
		 */

		if (dev->flags & IFF_UP)
		{
			/*
			 * If the packet is still being sent by the device/protocol
			 * below then don't retransmit. This is both needed, and good -
			 * especially with connected mode AX.25 where it stops resends
			 * occurring of an as yet unsent anyway frame!
			 * We still add up the counts as the round trip time wants
			 * adjusting.
			 */
			if (sk && !skb_device_locked(skb))
			{
				/* Remove it from any existing driver queue first! */
				skb_unlink(skb);
				/* Now queue it */
				ip_statistics.IpOutRequests++;
				dev_queue_xmit(skb, dev, sk->priority);
			}
		}

		/*
		 * Count retransmissions
		 */

		ct++;
		sk->prot->retransmits ++;

		/*
		 * Only one retransmit requested.
		 */

		if (!all)
			break;

		/*
		 * This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;	/* link3 chains the retransmit queue */
	}
}
508 /* 509 * Reset the retransmission timer 510 */ 511
512 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 513 { 514 del_timer(&sk->retransmit_timer);
515 sk->ip_xmit_timeout = why;
516 if((int)when < 0)
517 { 518 when=3;
519 printk("Error: Negative timer in xmit_timer\n");
520 } 521 sk->retransmit_timer.expires=when;
522 add_timer(&sk->retransmit_timer);
523 } 524
/*
 * This is the normal code called for timeouts.  It does the retransmission
 * and then does backoff.  tcp_do_retransmit is separated out because
 * tcp_ack needs to send stuff from the retransmit queue without
 * initiating a backoff.
 */
void tcp_retransmit_time(struct sock *sk, int all)
{
	tcp_do_retransmit(sk, all);

	/*
	 * Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely.  We will have to fix
	 * the 120 second clamps though!
	 */

	sk->retransmits++;
	sk->backoff++;
	sk->rto = min(sk->rto << 1, 120*HZ);	/* exponential backoff, clamped at 120s */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
}
560
561 /* 562 * A timer event has trigger a tcp retransmit timeout. The 563 * socket xmit queue is ready and set up to send. Because 564 * the ack receive code keeps the queue straight we do 565 * nothing clever here. 566 */ 567
568 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 569 { 570 if (all)
571 { 572 tcp_retransmit_time(sk, all);
573 return;
574 } 575
576 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 577 /* sk->ssthresh in theory can be zero. I guess that's OK */ 578 sk->cong_count = 0;
579
580 sk->cong_window = 1;
581
582 /* Do the actual retransmit. */ 583 tcp_retransmit_time(sk, all);
584 } 585
/*
 * A write timeout has occurred. Process the after effects.
 *
 * Returns 1 if the socket is still alive, 0 if it has been moved to
 * TCP_CLOSE and the caller should stop using its timers.
 */
static int tcp_write_timeout(struct sock *sk)
{
	/*
	 * Look for a 'soft' timeout: every 8th retransmit when
	 * established, or past TCP_RETR1 retransmits otherwise.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
	{
		/*
		 * Attempt to recover if arp has changed (unlikely!) or
		 * a route has shifted (not supported prior to 1.3).
		 */
		arp_destroy (sk->daddr, 0);
		ip_route_check (sk->daddr);
	}
	/*
	 * Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2)
	{
		sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 * Time wait the socket
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 * Clean up time: connection is dead.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			return 0;
		}
	}
	return 1;
}
/*
 * The TCP retransmit timer. This lacks a few small details.
 *
 * 1. An initial rtt timeout on the probe0 should cause what we can
 *    of the first write queue buffer to be split and sent.
 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report
 *    ETIMEDOUT if we know an additional 'soft' error caused this.
 *    tcp_err should save a 'soft error' for us.
 */
static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;	/* reason the timer was armed */

	/*
	 * only process if socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second.
		   NOTE(review): 'expires = HZ' looks like a relative tick
		   count - confirm against this kernel's add_timer(). */
		sk->retransmit_timer.expires = HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;		/* lock the socket against the bottom half */
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				/* Nothing left to retransmit. */
				restore_flags(flags);
			}
			else
			{
				/*
				 * Kicked by a delayed ack. Reset timer
				 * correctly now
				 */
				if (jiffies < skb->when + sk->rto)
				{
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 * Retransmission
				 */
				sk->prot->retransmit (sk, 0);
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 */
void tcp_err(int err, unsigned char *header, unsigned long daddr,
	unsigned long saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	struct iphdr *iph=(struct iphdr *)header;

	/* Step over the IP header to the embedded TCP header. */
	header+=4*iph->ihl;


	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)
		return;		/* not one of our connections */

	/* err < 0: hand the (negated) errno straight to the user. */
	if(err<0)
	{
		sk->err = -err;
		sk->error_report(sk);
		return;
	}

	if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8))
	{
		/*
		 * FIXME:
		 * For now we will just trigger a linear backoff.
		 * The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	/* sk->err = icmp_err_convert[err & 0xff].errno;  -- moved as TCP should hide non fatals internally (and does) */

	/*
	 * If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 */

	if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT)
	{
		if (sk->state == TCP_SYN_SENT)
		{
			/* The connection attempt was killed by the error. */
			tcp_statistics.TcpAttemptFails++;
			tcp_set_state(sk,TCP_CLOSE);
			sk->error_report(sk);		/* Wake people up to see the error (see connect in sock.c) */
		}
		sk->err = icmp_err_convert[err & 0xff].errno;
	}
	return;
}
805
/*
 * Walk down the receive queue counting readable data until we hit the end or we find a gap
 * in the received data queue (ie a frame missing that needs sending to us). Not
 * sorting using two queues as data arrives makes life so much harder.
 *
 * Returns the number of bytes a read() could consume without blocking.
 */
static int tcp_readable(struct sock *sk)
{
	unsigned long counted;		/* sequence number we have counted up to */
	unsigned long amount;		/* readable bytes found so far */
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	/* The queue is shared with interrupt time code: walk it atomically. */
	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 * Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->h.th->seq))	/* Found a hole so stops here */
			break;
		sum = skb->len -(counted - skb->h.th->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;		/* SYN occupies a sequence number but no data byte */
		if (sum > 0)
		{	/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;	/* don't count the SYN as readable data */
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop.  This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;	/* a PSH with data ends the count */
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
883 /* 884 * LISTEN is a special case for select.. 885 */ 886 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */ 887 { 888 if (sel_type == SEL_IN) { 889 intretval;
890
891 sk->inuse = 1;
892 retval = (tcp_find_established(sk) != NULL);
893 release_sock(sk);
894 if (!retval)
895 select_wait(&master_select_wakeup,wait);
896 returnretval;
897 } 898 return 0;
899 } 900
901
/*
 * Wait for a TCP event.
 *
 * Note that we don't need to set "sk->inuse", as the upper select layers
 * take care of normal races (between the test and the event) and we don't
 * go look at any of the socket buffers directly.
 *
 * Returns 1 when the requested condition holds, otherwise registers
 * on the socket's wait queue and returns 0.
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type) {
	case SEL_IN:
		if (sk->err)
			return 1;	/* a pending error is 'readable' */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;		/* not connected yet: sleep */

		if (sk->shutdown & RCV_SHUTDOWN)
			return 1;	/* EOF is readable */

		if (sk->acked_seq == sk->copied_seq)
			break;		/* nothing new has been acked: sleep */

		/* Readable unless the only available byte is urgent data
		   that an inline-less reader would not see. */
		if (sk->urg_seq != sk->copied_seq ||
		    sk->acked_seq != sk->copied_seq+1 ||
		    sk->urginline || !sk->urg_data)
			return 1;
		break;

	case SEL_OUT:
		if (sk->shutdown & SEND_SHUTDOWN)
			return 0;	/* can never write again */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;
		/*
		 * This is now right thanks to a small fix
		 * by Matt Dillon.
		 */

		/* Only writable when there is room for a full sized frame. */
		if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
			break;
		return 1;

	case SEL_EX:
		if (sk->err || sk->urg_data)
			return 1;	/* exceptional: error or urgent data */
		break;
	}
	select_wait(sk->sleep, wait);
	return 0;
}
/*
 * IOCTL requests applicable to TCP: input/output queue sizes and
 * the urgent-data mark.  Returns 0 or a negative errno.
 */
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	int err;
	switch(cmd)
	{
		case TIOCINQ:		/* bytes readable without blocking */
#ifdef FIXME	/* FIXME: */
		case FIONREAD:
#endif
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);

			sk->inuse = 1;
			amount = tcp_readable(sk);
			release_sock(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg,
					sizeof(unsigned long));
			if(err)
				return err;
			put_fs_long(amount,(unsigned long *)arg);
			return(0);
		}
		case SIOCATMARK:	/* is the read pointer at the urgent mark? */
		{
			int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;

			err = verify_area(VERIFY_WRITE,(void *) arg,
					sizeof(unsigned long));
			if (err)
				return err;
			/* NOTE(review): put_fs_long() stores a long through a
			   pointer cast to (int *) - harmless where
			   sizeof(long)==sizeof(int), verify on other ports. */
			put_fs_long(answ,(int *) arg);
			return(0);
		}
		case TIOCOUTQ:		/* free space in the send buffer */
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN) return(-EINVAL);
			amount = sk->prot->wspace(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg,
					sizeof(unsigned long));
			if(err)
				return err;
			put_fs_long(amount,(unsigned long *)arg);
			return(0);
		}
		default:
			return(-EINVAL);
	}
}
1011
/*
 * This routine computes a TCP checksum.
 *
 * i386-only GNU assembler: the first asm block folds the pseudo
 * header (source address, destination address, protocol and segment
 * length) into the running sum; the second sums the TCP header and
 * data 32 bits at a time, handles the trailing word/byte, and folds
 * the carries back into the low 16 bits.
 * NOTE(review): the multi-line string asm syntax here is accepted
 * only by the old GCC versions this kernel was built with.
 */
unsigned short tcp_check(struct tcphdr *th, int len,
	  unsigned long saddr, unsigned long daddr)
{
	unsigned long sum;

	if (saddr == 0) saddr = ip_my_addr();

	/*
	 * stupid, gcc complains when I use just one __asm__ block,
	 * something about too many reloads, but this is just two
	 * instructions longer than what I want
	 */
	__asm__("
	    addl %%ecx, %%ebx
	    adcl %%edx, %%ebx
	    adcl $0, %%ebx
	    "
	: "=b"(sum)
	: "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
	: "bx", "cx", "dx" );
	__asm__("
	    movl %%ecx, %%edx
	    cld
	    cmpl $32, %%ecx
	    jb 2f
	    shrl $5, %%ecx
	    clc
	1:  lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    loop 1b
	    adcl $0, %%ebx
	    movl %%edx, %%ecx
	2:  andl $28, %%ecx
	    je 4f
	    shrl $2, %%ecx
	    clc
	3:  lodsl
	    adcl %%eax, %%ebx
	    loop 3b
	    adcl $0, %%ebx
	4:  movl $0, %%eax
	    testw $2, %%dx
	    je 5f
	    lodsw
	    addl %%eax, %%ebx
	    adcl $0, %%ebx
	    movw $0, %%ax
	5:  test $1, %%edx
	    je 6f
	    lodsb
	    addl %%eax, %%ebx
	    adcl $0, %%ebx
	6:  movl %%ebx, %%eax
	    shrl $16, %%eax
	    addw %%ax, %%bx
	    adcw $0, %%bx
	    "
	: "=b"(sum)
	: "0"(sum), "c"(len), "S"(th)
	: "ax", "bx", "cx", "dx", "si" );

	/* We only want the bottom 16 bits, but we never cleared the top 16. */

	return((~sum) & 0xffff);
}
1096
1097
1098 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1099 unsignedlongdaddr, intlen, structsock *sk)
1100 {1101 th->check = 0;
1102 th->check = tcp_check(th, len, saddr, daddr);
1103 return;
1104 }1105
1106 /*1107 * This is the main buffer sending routine. We queue the buffer1108 * having checked it is sane seeming.1109 */1110
/*
 *	This is the main buffer sending routine. We queue the buffer
 *	having checked it is sane seeming.
 *
 *	Either transmits the skb immediately or appends it to the socket's
 *	write queue for later transmission (window closed, retransmitting,
 *	or congestion window full). On malformed input the skb is freed.
 */
static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 *	length of packet (not counting length of pre-tcp headers)
	 */
	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 *	Sanity check it..
	 */
	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 *	If we have queued a header size packet.. (these crash a few
	 *	tcp stacks if ack is not set)
	 */
	if (size == sizeof(struct tcphdr))
	{
		/* If its got a syn or fin its notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 *	Actual processing.
	 */
	tcp_statistics.TcpOutSegs++;
	/* h.seq = sequence number of the byte just past this segment. */
	skb->h.seq = ntohl(th->seq) + size - 4*th->doff;

	/*
	 *	We must queue if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */
	if (after(skb->h.seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit. So
		 * we shouldn't need to set it at all.  I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 *	If we don't fit we have to start the zero window
		 *	probes. This is broken - we really need to do a partial
		 *	send _first_ (This is what causes the Cisco and PC/TCP
		 *	grief).
		 */
		if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 *	This is going straight out
		 */
		/* NOTE(review): ntohl/ntohs used where htonl/htons is meant;
		 * identical on i386 (both swap) — presumably intentional. */
		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 *	This is mad. The tcp retransmit queue is put together
		 *	by the ip layer. This causes half the problems with
		 *	unroutable FIN's and other things.
		 */
		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 *	Set for next retransmit based on expected ACK time.
		 *	FIXME: We set this every time which means our
		 *	retransmits are really about a window behind.
		 */
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
1221 /*1222 * Locking problems lead us to a messy situation where we can have1223 * multiple partially complete buffers queued up. This is really bad1224 * as we don't want to be sending partial buffers. Fix this with1225 * a semaphore or similar to lock tcp_write per socket.1226 *1227 * These routines are pretty self descriptive.1228 */1229
1230 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1231 {1232 structsk_buff * skb;
1233 unsignedlongflags;
1234
1235 save_flags(flags);
1236 cli();
1237 skb = sk->partial;
1238 if (skb) {1239 sk->partial = NULL;
1240 del_timer(&sk->partial_timer);
1241 }1242 restore_flags(flags);
1243 returnskb;
1244 }1245
1246 /*1247 * Empty the partial queue1248 */1249
1250 staticvoidtcp_send_partial(structsock *sk)
/* */1251 {1252 structsk_buff *skb;
1253
1254 if (sk == NULL)
1255 return;
1256 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1257 tcp_send_skb(sk, skb);
1258 }1259
1260 /*1261 * Queue a partial frame1262 */1263
/*
 *	Queue a partial frame
 *
 *	Installs skb as the socket's pending half-built buffer and arms a
 *	timer to flush it. Any previously queued partial frame is displaced
 *	and transmitted immediately (after interrupts are re-enabled).
 */
void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	tmp = sk->partial;
	if (tmp)
		del_timer(&sk->partial_timer);
	sk->partial = skb;
	init_timer(&sk->partial_timer);
	/*
	 * Wait up to 1 second for the buffer to fill.
	 * NOTE(review): expires is set to a relative value (HZ); this
	 * assumes add_timer() in this kernel treats expires as a delta
	 * from now — verify against kernel/sched.c.
	 */
	sk->partial_timer.expires = HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	/* Send the displaced frame outside the cli() window. */
	if (tmp)
		tcp_send_skb(sk, tmp);
}
1288
1289 /*1290 * This routine sends an ack and also updates the window. 1291 */1292
/*
 *	This routine sends an ack and also updates the window.
 *
 *	sequence - sequence number to place in the ACK segment
 *	ack      - acknowledgement number to send
 *	sk       - sending socket
 *	th       - header of the segment being acknowledged (used to swap
 *	           the port numbers into the reply)
 *	daddr    - destination IP address
 *
 *	If no atomic memory is available the ACK is deferred by bumping
 *	ack_backlog and arming a short write timer (ACKs are unreliable
 *	anyway, so this just retries soon).
 */
static void tcp_send_ack(unsigned long sequence, unsigned long ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;		/* We have been reset, we may not send again */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */
	buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 *	Force it to send an ack. We don't have to do this
		 *	(ACK is unreliable) but its much better use of
		 *	bandwidth on slow links to send a spare ack than
		 *	resend packets.
		 */
		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 *	Assemble a suitable TCP frame
	 */
	buff->len = sizeof(struct tcphdr);
	buff->sk = sk;
	buff->localroute = sk->localroute;
	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route: give the buffer back and quietly drop the ACK. */
		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}
	buff->len += tmp;
	t1 =(struct tcphdr *)((char *)t1 +tmp);

	/* Start from the received header, then swap direction and rewrite. */
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */
	t1->dest = th->source;
	t1->source = th->dest;
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 *	If we have nothing queued for transmit and the transmit timer
	 *	is on we are just doing an ACK timeout and need to switch
	 *	to a keepalive.
	 */
	if (ack == sk->acked_seq)
	{
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
			&& sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 *	Fill in the packet and send it
	 */
	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}
1407
1408 /* 1409 * This routine builds a generic TCP header. 1410 */1411
/*
 *	This routine builds a generic TCP header.
 *
 *	Copies the socket's template header (dummy_th) into th, then fills
 *	in the current sequence/ack numbers and advertised window. PSH is
 *	set when push == 0. As a side effect the pending-ack bookkeeping
 *	(ack_backlog, bytes_rcv, ack_timed) is cleared, since this segment
 *	will carry the ACK.
 *
 *	Returns the header length in bytes (sizeof(struct tcphdr)).
 */
extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
{
	memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
	th->seq = htonl(sk->write_seq);
	th->psh =(push == 0) ? 1 : 0;
	th->doff = sizeof(*th)/4;
	th->ack = 1;
	th->fin = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->ack_timed = 0;
	th->ack_seq = htonl(sk->acked_seq);
	sk->window = tcp_select_window(sk);
	th->window = htons(sk->window);

	return(sizeof(*th));
}
1431 /*1432 * This routine copies from a user buffer into a socket,1433 * and starts the transmit system.1434 */1435
/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	Returns the number of bytes queued/sent, or a negative errno.
 *	May sleep (for connection establishment or buffer memory) unless
 *	nonblock is set. Once any data has been copied, errors are held
 *	back and the partial count is returned instead.
 */
static int tcp_write(struct sock *sk, unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;
	int copy;
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	unsigned char *buff;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1;
	prot = sk->prot;
	while(len > 0)
	{
		if (sk->err)
		{			/* Stop on an error */
			release_sock(sk);
			if (copied)
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 *	First thing we do is make sure that we are established.
		 */
		if (sk->shutdown & SEND_SHUTDOWN)
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied)
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/*
		 *	Wait for a connection to finish.
		 */
		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
		{
			if (sk->err)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
			{
				release_sock(sk);
				if (copied)
					return(copied);

				if (sk->err)
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen)
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			release_sock(sk);
			cli();

			/* Re-test state with interrupts off before sleeping:
			 * it may have changed while we held the socket. */
			if (sk->state != TCP_ESTABLISHED &&
				sk->state != TCP_CLOSE_WAIT && sk->err == 0)
			{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

	/*
	 *	The following code can result in copy <= if sk->mss is ever
	 *	decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
	 *	sk->mtu is constant once SYN processing is finished. I.e. we
	 *	had better not get here until we've seen his SYN and at least one
	 *	valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
	 *	But ESTABLISHED should guarantee that. sk->max_window is by definition
	 *	non-decreasing. Note that any ioctl to set user_mss must be done
	 *	before the exchange of SYN's. If the initial ack from the other
	 *	end has a window of 0, max_window and thus mss will both be 0.
	 */

		/*
		 *	Now we need to check if we have a half built packet.
		 */
		if ((skb = tcp_dequeue_partial(sk)) != NULL)
		{
			int hdrlen;

			/* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				+ sizeof(struct tcphdr);

			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB))
			{
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0)
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}

				memcpy_fromfs(skb->data + skb->len, from, copy);
				skb->len += copy;
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy;
			}
			/* Full segment, OOB, or nothing in flight: send now;
			 * otherwise requeue it as a partial again. */
			if ((skb->len - hdrlen) >= sk->mss ||
				(flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);
			continue;
		}

	/*
	 *	We also need to worry about the window.
	 *	If window < 1/2 the maximum window we've seen from this
	 *	host, don't use it.  This is sender side
	 *	silly window prevention, as specified in RFC1122.
	 *	(Note that this is different than earlier versions of
	 *	SWS prevention, e.g. RFC813.). What we actually do is
	 *	use the whole MSS. Since the results in the right
	 *	edge of the packet being outside the window, it will
	 *	be queued for later rather than sent.
	 */
		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		if (copy > len)
			copy = len;

		/*
		 *	We should really check the window here also.
		 */
		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB))
		{
			/*
			 *	We will release the socket incase we sleep here.
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb;
		}
		else
		{
			/*
			 *	We will release the socket incase we sleep here.
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep.
		 */
		if (skb == NULL)
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition.
			 */
			tmp = sk->wmem_alloc;
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it.
			 */
			if (tmp <= sk->wmem_alloc &&
			  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0)
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->len = 0;
		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

		buff = skb->data;

		/*
		 *	FIXME: we need to optimize this.
		 *	Perhaps some hints here would be good.
		 */
		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 )
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}
		skb->len += tmp;
		skb->dev = dev;
		buff += tmp;
		skb->h.th =(struct tcphdr *) buff;
		tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
		if (tmp < 0)
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB)
		{
			((struct tcphdr *)buff)->urg = 1;
			/* NOTE(review): ntohs used where htons is meant;
			 * identical on i386. */
			((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
		}
		skb->len += tmp;
		memcpy_fromfs(buff+tmp, from, copy);

		from += copy;
		copied += copy;
		len -= copy;
		skb->len += copy;
		skb->free = 0;
		sk->write_seq += copy;

		/* Nagle: if a small frame was built and data is in flight,
		 * hold it back as a partial rather than sending now. */
		if (send_tmp != NULL && sk->packets_out)
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

	/*
	 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
	 *	interactive fast network servers. It's meant to be on and
	 *	it really improves the throughput though not the echo time
	 *	on my slow slip link - Alan
	 */

	/*
	 *	Avoid possible race on send_tmp - c/o Johannes Stille
	 */
	if(sk->partial && ((!sk->packets_out)
	/* If not nagling we can send on the before case too.. */
		|| (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
1761 /*1762 * This is just a wrapper. 1763 */1764
1765 staticinttcp_sendto(structsock *sk, unsignedchar *from,
/* */1766 intlen, intnonblock, unsignedflags,
1767 structsockaddr_in *addr, intaddr_len)
1768 {1769 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1770 return -EINVAL;
1771 if (sk->state == TCP_CLOSE)
1772 return -ENOTCONN;
1773 if (addr_len < sizeof(*addr))
1774 return -EINVAL;
1775 if (addr->sin_family && addr->sin_family != AF_INET)
1776 return -EINVAL;
1777 if (addr->sin_port != sk->dummy_th.dest)
1778 return -EISCONN;
1779 if (addr->sin_addr.s_addr != sk->daddr)
1780 return -EISCONN;
1781 returntcp_write(sk, from, len, nonblock, flags);
1782 }1783
1784
1785 /*1786 * Send an ack if one is backlogged at this point. Ought to merge1787 * this with tcp_send_ack().1788 */1789
/*
 *	Send an ack if one is backlogged at this point. Ought to merge
 *	this with tcp_send_ack().
 *
 *	Builds and transmits a bare ACK carrying the current window.
 *	No-op when no ack is owed; on atomic allocation failure the
 *	attempt is retried via a short write timer.
 */
static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	if (!sk->ack_backlog)
		return;

	/*
	 * FIXME: we need to put code here to prevent this routine from
	 * being called. Being called once in a while is ok, so only check
	 * if this is the second time in a row.
	 */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */
	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->len = sizeof(struct tcphdr);
	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route: drop the ACK silently. */
		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}

	buff->len += tmp;
	t1 =(struct tcphdr *)(buff->data +tmp);

	/* Template header, then fill in the live fields. */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	/* NOTE(review): ntohs/ntohl used where htons/htonl is meant;
	 * identical on i386. */
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
1859
1860 /*1861 * FIXME:1862 * This routine frees used buffers.1863 * It should consider sending an ACK to let the1864 * other end know we now have a bigger window.1865 */1866
/*
 *	FIXME:
 *	This routine frees used buffers.
 *	It should consider sending an ACK to let the
 *	other end know we now have a bigger window.
 *
 *	Frees fully-consumed skbs from the head of the receive queue
 *	(under cli(), since the queue is also touched at interrupt time),
 *	then decides whether the freed space warrants an immediate window
 *	update ACK or just a delayed one.
 */
static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	left = sk->prot->rspace(sk);

	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can.
	 */
	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		/* Stop at the first skb still unread or in use by a reader. */
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */
	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
			left);
	if ((rspace=sk->prot->rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble. The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window. If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately. Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
		/*
		 * It's unclear whether to use sk->mtu or sk->mss here. They differ only
		 * if the other end is offering a window smaller than the agreed on MSS
		 * (called sk->mtu here). In theory there's no connection between send
		 * and receive, and so no reason to think that they're going to send
		 * small packets. For the moment I'm using the hack of reducing the mss
		 * only on the send side, so I'm putting mtu here.
		 */
		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);
		}
	}
}
1949
1950 /*1951 * Handle reading urgent data. BSD has very simple semantics for1952 * this, no blocking and very strange errors 8)1953 */1954
/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 *
 *	Returns 1 with the single OOB byte copied to user space, 0 on an
 *	orderly close/shutdown, -EINVAL when there is no OOB byte to read
 *	(or it was already consumed, or the socket is in inline-urgent
 *	mode), -EAGAIN when urgent data is signalled but not yet valid.
 *	Never blocks, regardless of the socket's blocking mode.
 */
static int tcp_read_urg(struct sock * sk, int nonblock,
	 unsigned char *to, int len, unsigned flags)
{
	/*
	 *	No URG data to read
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
	{
		int tmp = -sk->err;
		sk->err = 0;
		return tmp;
	}

	if (sk->state == TCP_CLOSE || sk->done)
	{
		if (!sk->done) {
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	/* Lock the socket only once we know we will touch urg_data. */
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID)
	{
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;
		put_fs_byte(c, to);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
2007
2008 /*2009 * This routine copies from a sock struct into the user buffer. 2010 */2011
/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Walks the receive queue copying in-sequence data to user space,
 *	handling urgent-byte skipping, PEEK (via a local sequence copy),
 *	FIN processing and blocking. Returns bytes copied or a negative
 *	errno; errors are suppressed once any data has been copied.
 */
static int tcp_read(struct sock *sk, unsigned char *to,
	int len, int nonblock, unsigned flags)
{
	struct wait_queue wait = {current, NULL};
	int copied = 0;
	unsigned long peek_seq;
	volatile unsigned long *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */
	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */
	if (flags & MSG_OOB)
		return tcp_read_urg(sk, nonblock, to, len, flags);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 *	PEEK reads advance only the local copy.
	 */
	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		unsigned long offset;

		/*
		 *	Are we at urgent data? Stop if we have read anything.
		 */
		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer. Set the state before scanning so a
		 *	wakeup between the scan and schedule() is not lost.
		 */
		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			if (skb->h.th->syn)
				offset--;
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		if (copied)
			break;

		if (sk->err)
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Nothing to read yet: ack what we've consumed and sleep. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */
		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */
		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?
		 */
		if (sk->urg_data)
		{
			unsigned long urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					/* Skip over the urgent byte unless the
					 * user wants it delivered inline. */
					if (!sk->urginline)
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */
		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */
		memcpy_tofs(to,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		to += used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */
		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;
		if (used + offset < skb->len)
			continue;

		/*
		 *	Process the FIN.
		 */
		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		continue;

	found_fin_ok:
		/* FIN consumes one sequence number. */
		++*seq;
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */
		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2239 /*2240 * State processing on a close. This implements the state shift for2241 * sending our FIN frame. Note that we only send a FIN for some 2242 * states. A shutdown() may have already sent the FIN, or we may be2243 * closed.2244 */2245
2246 staticinttcp_close_state(structsock *sk, intdead)
/* */2247 {2248 intns=TCP_CLOSE;
2249 intsend_fin=0;
2250 switch(sk->state)
2251 {2252 caseTCP_SYN_SENT: /* No SYN back, no FIN needed */2253 break;
2254 caseTCP_SYN_RECV:
2255 caseTCP_ESTABLISHED: /* Closedown begin */2256 ns=TCP_FIN_WAIT1;
2257 send_fin=1;
2258 break;
2259 caseTCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */2260 caseTCP_FIN_WAIT2:
2261 caseTCP_CLOSING:
2262 ns=sk->state;
2263 break;
2264 caseTCP_CLOSE:
2265 caseTCP_LISTEN:
2266 break;
2267 caseTCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and2268 wait only for the ACK */2269 ns=TCP_LAST_ACK;
2270 send_fin=1;
2271 }2272
2273 tcp_set_state(sk,ns);
2274
2275 /*2276 * This is a (useful) BSD violating of the RFC. There is a2277 * problem with TCP as specified in that the other end could2278 * keep a socket open forever with no application left this end.2279 * We use a 3 minute timeout (about the same as BSD) then kill2280 * our end. If they send after that then tough - BUT: long enough2281 * that we won't make the old 4*rto = almost no time - whoops2282 * reset mistake.2283 */2284 if(dead && ns==TCP_FIN_WAIT2)
2285 {2286 inttimer_active=del_timer(&sk->timer);
2287 if(timer_active)
2288 add_timer(&sk->timer);
2289 else2290 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2291 }2292
2293 returnsend_fin;
2294 }2295
2296 /*2297 * Send a fin.2298 */2299
/*
 *	Send a fin.
 *
 *	Builds a FIN segment from the socket's template header and either
 *	transmits it immediately or, if data is still queued, appends it
 *	to the write queue so it goes out in order. Advances write_seq by
 *	one (the FIN occupies a sequence number) even on failure paths.
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */
	buff->sk = sk;
	buff->len = sizeof(*t1);
	buff->localroute = sk->localroute;
	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost.
		 *	(Not good).
		 */
		buff->free = 1;
		prot->wfree(sk,buff->mem_addr, buff->mem_len);
		sk->write_seq++;
		/* Leave an existing timer alone, else fall back to the
		 * timewait timeout so the close still completes. */
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}

	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */
	t1 =(struct tcphdr *)((char *)t1 +tmp);
	buff->len += tmp;
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	/* NOTE(review): ntohl/ntohs used where htonl/htons is meant;
	 * identical on i386. */
	t1->seq = ntohl(sk->write_seq);
	sk->write_seq++;
	buff->h.seq = sk->write_seq;
	t1->ack = 1;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 *	If there is data in the write queue, the fin must be appended to
	 *	the write queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		buff->free = 0;
		if (buff->next != NULL)
		{
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	}
	else
	{
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2399 /*2400 * Shutdown the sending side of a connection. Much like close except2401 * that we don't receive shut down or set sk->dead=1.2402 */2403
2404 voidtcp_shutdown(structsock *sk, inthow)
/* */2405 {2406 /*2407 * We need to grab some memory, and put together a FIN,2408 * and then put it into the queue to be sent.2409 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2410 */2411
2412 if (!(how & SEND_SHUTDOWN))
2413 return;
2414
2415 /*2416 * If we've already sent a FIN, or its a closed state2417 */2418
2419 if (sk->state == TCP_FIN_WAIT1 ||
2420 sk->state == TCP_FIN_WAIT2 ||
2421 sk->state == TCP_CLOSING ||
2422 sk->state == TCP_LAST_ACK ||
2423 sk->state == TCP_TIME_WAIT ||
2424 sk->state == TCP_CLOSE ||
2425 sk->state == TCP_LISTEN2426 )
2427 {2428 return;
2429 }2430 sk->inuse = 1;
2431
2432 /*2433 * flag that the sender has shutdown2434 */2435
2436 sk->shutdown |= SEND_SHUTDOWN;
2437
2438 /*2439 * Clear out any half completed packets. 2440 */2441
2442 if (sk->partial)
2443 tcp_send_partial(sk);
2444
2445 /*2446 * FIN if needed2447 */2448
2449 if(tcp_close_state(sk,0))
2450 tcp_send_fin(sk);
2451
2452 release_sock(sk);
2453 }2454
2455
2456 staticint2457 tcp_recvfrom(structsock *sk, unsignedchar *to,
/* */2458 intto_len, intnonblock, unsignedflags,
2459 structsockaddr_in *addr, int *addr_len)
2460 {2461 intresult;
2462
2463 /* 2464 * Have to check these first unlike the old code. If 2465 * we check them after we lose data on an error2466 * which is wrong 2467 */2468
2469 if(addr_len)
2470 *addr_len = sizeof(*addr);
2471 result=tcp_read(sk, to, to_len, nonblock, flags);
2472
2473 if (result < 0)
2474 return(result);
2475
2476 if(addr)
2477 {2478 addr->sin_family = AF_INET;
2479 addr->sin_port = sk->dummy_th.dest;
2480 addr->sin_addr.s_addr = sk->daddr;
2481 }2482 return(result);
2483 }2484
2485
2486 /*2487 * This routine will send an RST to the other tcp. 2488 */2489
2490 staticvoidtcp_reset(unsignedlongsaddr, unsignedlongdaddr, structtcphdr *th,
/* */2491 structproto *prot, structoptions *opt, structdevice *dev, inttos, intttl)
2492 {2493 structsk_buff *buff;
2494 structtcphdr *t1;
2495 inttmp;
2496 structdevice *ndev=NULL;
2497
2498 /*2499 * Cannot reset a reset (Think about it).2500 */2501
2502 if(th->rst)
2503 return;
2504
2505 /*2506 * We need to grab some memory, and put together an RST,2507 * and then put it into the queue to be sent.2508 */2509
2510 buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2511 if (buff == NULL)
2512 return;
2513
2514 buff->len = sizeof(*t1);
2515 buff->sk = NULL;
2516 buff->dev = dev;
2517 buff->localroute = 0;
2518
2519 t1 =(structtcphdr *) buff->data;
2520
2521 /*2522 * Put in the IP header and routing stuff. 2523 */2524
2525 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2526 sizeof(structtcphdr),tos,ttl);
2527 if (tmp < 0)
2528 {2529 buff->free = 1;
2530 prot->wfree(NULL, buff->mem_addr, buff->mem_len);
2531 return;
2532 }2533
2534 t1 =(structtcphdr *)((char *)t1 +tmp);
2535 buff->len += tmp;
2536 memcpy(t1, th, sizeof(*t1));
2537
2538 /*2539 * Swap the send and the receive. 2540 */2541
2542 t1->dest = th->source;
2543 t1->source = th->dest;
2544 t1->rst = 1;
2545 t1->window = 0;
2546
2547 if(th->ack)
2548 {2549 t1->ack = 0;
2550 t1->seq = th->ack_seq;
2551 t1->ack_seq = 0;
2552 }2553 else2554 {2555 t1->ack = 1;
2556 if(!th->syn)
2557 t1->ack_seq=htonl(th->seq);
2558 else2559 t1->ack_seq=htonl(th->seq+1);
2560 t1->seq=0;
2561 }2562
2563 t1->syn = 0;
2564 t1->urg = 0;
2565 t1->fin = 0;
2566 t1->psh = 0;
2567 t1->doff = sizeof(*t1)/4;
2568 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2569 prot->queue_xmit(NULL, ndev, buff, 1);
2570 tcp_statistics.TcpOutSegs++;
2571 }2572
2573
2574 /*2575 * Look for tcp options. Parses everything but only knows about MSS.2576 * This routine is always called with the packet containing the SYN.2577 * However it may also be called with the ack to the SYN. So you2578 * can't assume this is always the SYN. It's always called after2579 * we have set up sk->mtu to our own MTU.2580 *2581 * We need at minimum to add PAWS support here. Possibly large windows2582 * as Linux gets deployed on 100Mb/sec networks.2583 */2584
2585 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2586 {2587 unsignedchar *ptr;
2588 intlength=(th->doff*4)-sizeof(structtcphdr);
2589 intmss_seen = 0;
2590
2591 ptr = (unsignedchar *)(th + 1);
2592
2593 while(length>0)
2594 {2595 intopcode=*ptr++;
2596 intopsize=*ptr++;
2597 switch(opcode)
2598 {2599 caseTCPOPT_EOL:
2600 return;
2601 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */2602 length--;
2603 ptr--; /* the opsize=*ptr++ above was a mistake */2604 continue;
2605
2606 default:
2607 if(opsize<=2) /* Avoid silly options looping forever */2608 return;
2609 switch(opcode)
2610 {2611 caseTCPOPT_MSS:
2612 if(opsize==4 && th->syn)
2613 {2614 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2615 mss_seen = 1;
2616 }2617 break;
2618 /* Add other options here as people feel the urge to implement stuff like large windows */2619 }2620 ptr+=opsize-2;
2621 length-=opsize;
2622 }2623 }2624 if (th->syn)
2625 {2626 if (! mss_seen)
2627 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2628 }2629 #ifdefCONFIG_INET_PCTCP2630 sk->mss = min(sk->max_window >> 1, sk->mtu);
2631 #else2632 sk->mss = min(sk->max_window, sk->mtu);
2633 #endif2634 }2635
2636 staticinlineunsignedlongdefault_mask(unsignedlongdst)
/* */2637 {2638 dst = ntohl(dst);
2639 if (IN_CLASSA(dst))
2640 returnhtonl(IN_CLASSA_NET);
2641 if (IN_CLASSB(dst))
2642 returnhtonl(IN_CLASSB_NET);
2643 returnhtonl(IN_CLASSC_NET);
2644 }2645
2646 /*2647 * Default sequence number picking algorithm.2648 * As close as possible to RFC 793, which2649 * suggests using a 250kHz clock.2650 * Further reading shows this assumes 2MB/s networks.2651 * For 10MB/s ethernet, a 1MHz clock is appropriate.2652 * That's funny, Linux has one built in! Use it!2653 */2654
2655 externinlineunsignedlongtcp_init_seq(void)
/* */2656 {2657 structtimevaltv;
2658 do_gettimeofday(&tv);
2659 returntv.tv_usec+tv.tv_sec*1000000;
2660 }2661
2662 /*2663 * This routine handles a connection request.2664 * It should make sure we haven't already responded.2665 * Because of the way BSD works, we have to send a syn/ack now.2666 * This also means it will be harder to close a socket which is2667 * listening.2668 */2669
static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 unsigned long daddr, unsigned long saddr,
		 struct options *opt, struct device *dev, unsigned long seq)
{
	/*
	 *	Handle an incoming SYN on listening socket 'sk': clone the
	 *	socket into an embryonic TCP_SYN_RECV socket, answer with
	 *	SYN|ACK (carrying an MSS option), and queue the SYN skb on
	 *	the listener so accept() can find the new connection.
	 *	'seq' is our chosen initial send sequence number; daddr/saddr
	 *	are from the *incoming* packet's point of view and get
	 *	swapped below.
	 */
	struct sk_buff *buff;
	struct tcphdr *t1;
	unsigned char *ptr;
	struct sock *newsk;
	struct tcphdr *th;
	struct device *ndev = NULL;
	int tmp;
	struct rtable *rt;

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection: reset it. */
	if (!sk->dead)
	{
		sk->data_ready(sk, 0);
	}
	else
	{
		if (sk->debug)
			printk("Reset on %p: Connect on dead socket.\n", sk);
		tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	Make sure we can accept more.  This will prevent a
	 *	flurry of syns from eating up all our memory.
	 *	(The SYN is simply dropped; the peer will retransmit.)
	 */
	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	We need to build a new sock struct.
	 *	It is sort of bad to have a socket without an inode attached
	 *	to it, but the wake_up's will just wake up the listening socket,
	 *	and if the listening socket is destroyed before this is taken
	 *	off of the queue, this will take care of it.
	 */
	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* just ignore the syn.  It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	Start from a byte copy of the listener, then reset every
	 *	field that must not be shared: queues, timers, sequence
	 *	state, congestion state and memory accounting.
	 */
	memcpy(newsk, sk, sizeof(*newsk));
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;		/* slow start: one segment */
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	/* The SYN consumes one sequence number, hence the +1 */
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->fin_seq = skb->h.th->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger = 0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long) newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long) newsk;
	newsk->retransmit_timer.function = &retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/*
	 *	Swap these two, they are from our point of view.
	 */
	newsk->daddr = saddr;
	newsk->saddr = daddr;

	put_sock(newsk->num, newsk);
	newsk->dummy_th.res1 = 0;
	newsk->dummy_th.doff = 6;
	newsk->dummy_th.fin = 0;
	newsk->dummy_th.syn = 0;
	newsk->dummy_th.rst = 0;
	newsk->dummy_th.psh = 0;
	newsk->dummy_th.ack = 0;
	newsk->dummy_th.urg = 0;
	newsk->dummy_th.res2 = 0;
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->socket = NULL;

	/*
	 *	Grab the ttl and tos values and use them
	 */
	newsk->ip_ttl = sk->ip_ttl;
	newsk->ip_tos = skb->ip_hdr->tos;

	/*
	 *	Pick an MTU/MSS.  Note use of sk->user_mss, since the user
	 *	has no direct access to newsk.  Order of preference:
	 *	user-set MSS, route MSS, then 576 for non-local peers or
	 *	MAX_WINDOW for local ones.
	 */
	rt = ip_rt_route(saddr, NULL, NULL);

	if (rt != NULL && (rt->rt_flags & RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if (rt != NULL && (rt->rt_flags & RTF_MSS))
		newsk->mtu = rt->rt_mss - HEADER_SIZE;
	else
	{
#ifdef CONFIG_INET_SNARL	/* Sub Nets Are Local */
		if ((saddr ^ daddr) & default_mask(saddr))
#else
		if ((saddr ^ daddr) & dev->pa_mask)
#endif
			newsk->mtu = 576 - HEADER_SIZE;
		else
			newsk->mtu = MAX_WINDOW;
	}

	/*
	 *	But not bigger than device MTU
	 */
	newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	This will min with what arrived in the packet
	 */
	tcp_options(newsk, skb->h.th);

	/*
	 *	Build and send the SYN|ACK reply.
	 */
	buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		sk->err = -ENOMEM;
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		/* And this will destroy it */
		release_sock(newsk);
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	/* header + 4 bytes for the MSS option */
	buff->len = sizeof(struct tcphdr) + 4;
	buff->sk = newsk;
	buff->localroute = newsk->localroute;

	t1 = (struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
			       IPPROTO_TCP, NULL, MAX_SYN_SIZE, sk->ip_tos, sk->ip_ttl);

	/*
	 *	Something went wrong.
	 */
	if (tmp < 0)
	{
		sk->err = tmp;
		buff->free = 1;
		kfree_skb(buff, FREE_WRITE);
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		release_sock(newsk);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->len += tmp;
	t1 = (struct tcphdr *)((char *) t1 + tmp);

	memcpy(t1, skb->h.th, sizeof(*t1));
	buff->h.seq = newsk->write_seq;
	/*
	 *	Swap the send and the receive.
	 */
	t1->dest = skb->h.th->source;
	t1->source = newsk->dummy_th.source;
	t1->seq = ntohl(newsk->write_seq++);
	t1->ack = 1;
	newsk->window = tcp_select_window(newsk);
	newsk->sent_seq = newsk->write_seq;
	t1->window = ntohs(newsk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->ack_seq = ntohl(skb->h.th->seq + 1);
	t1->doff = sizeof(*t1)/4 + 1;	/* one extra 32-bit word of options */
	/* MSS option: kind 2, length 4, value = our mtu, big endian */
	ptr = (unsigned char *)(t1 + 1);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
	ptr[3] = (newsk->mtu) & 0xff;

	tcp_send_check(t1, daddr, saddr, sizeof(*t1) + 4, newsk);
	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
	reset_xmit_timer(newsk, TIME_WRITE, TCP_TIMEOUT_INIT);
	skb->sk = newsk;

	/*
	 *	Charge the sock_buff to newsk.
	 */
	sk->rmem_alloc -= skb->mem_len;
	newsk->rmem_alloc += skb->mem_len;

	/* Queue the SYN on the *listener* so accept() can pick it up */
	skb_queue_tail(&sk->receive_queue, skb);
	sk->ack_backlog++;
	release_sock(newsk);
	tcp_statistics.TcpOutSegs++;
}
2942
2943 staticvoidtcp_close(structsock *sk, inttimeout)
/* */2944 {2945 /*2946 * We need to grab some memory, and put together a FIN, 2947 * and then put it into the queue to be sent.2948 */2949
2950 sk->inuse = 1;
2951
2952 if(sk->state == TCP_LISTEN)
2953 {2954 /* Special case */2955 tcp_set_state(sk, TCP_CLOSE);
2956 tcp_close_pending(sk);
2957 release_sock(sk);
2958 return;
2959 }2960
2961 sk->keepopen = 1;
2962 sk->shutdown = SHUTDOWN_MASK;
2963
2964 if (!sk->dead)
2965 sk->state_change(sk);
2966
2967 if (timeout == 0)
2968 {2969 structsk_buff *skb;
2970
2971 /*2972 * We need to flush the recv. buffs. We do this only on the2973 * descriptor close, not protocol-sourced closes, because the2974 * reader process may not have drained the data yet!2975 */2976
2977 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
2978 kfree_skb(skb, FREE_READ);
2979 /*2980 * Get rid off any half-completed packets. 2981 */2982
2983 if (sk->partial)
2984 tcp_send_partial(sk);
2985 }2986
2987
2988 /*2989 * Timeout is not the same thing - however the code likes2990 * to send both the same way (sigh).2991 */2992
2993 if(timeout)
2994 {2995 tcp_set_state(sk, TCP_CLOSE); /* Dead */2996 }2997 else2998 {2999 if(tcp_close_state(sk,1)==1)
3000 {3001 tcp_send_fin(sk);
3002 }3003 }3004 release_sock(sk);
3005 }3006
3007
3008 /*3009 * This routine takes stuff off of the write queue,3010 * and puts it in the xmit queue. This happens as incoming acks3011 * open up the remote window for us.3012 */3013
static void tcp_write_xmit(struct sock *sk)
{
	/*
	 *	Move packets from the write queue onto the wire as the
	 *	peer's window (and our congestion window) allow.  Called
	 *	when incoming acks open up the remote window for us.
	 */
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy
	 */

	if (sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while ((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->h.seq, sk->rcv_ack_seq + 1))
		{
			/*
			 *	This is acked data. We can discard it. This
			 *	cannot currently occur.
			 */

			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
			/*
			 *	put in the ack seq and window at this point rather
			 *	than earlier, in order to keep them monotonic.
			 *	We really want to avoid taking back window
			 *	allocations.  That's legal, but RFC1122 says it's
			 *	frowned on.  Ack and window will in general have
			 *	changed since this packet was put on the write queue.
			 */
			iph = (struct iphdr *)(skb->data +
					       skb->dev->hard_header_len);
			th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
			/* size = TCP header + payload, for the checksum */
			size = skb->len - (((unsigned char *) th) - skb->data);

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			/* Header fields changed, so the checksum must be redone */
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */
			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3099
3100 /*3101 * This routine deals with incoming acks, but not outgoing ones.3102 */3103
extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
	/*
	 *	Handle the ACK half of an incoming segment: validate the ack,
	 *	track window changes, run slow start / congestion avoidance,
	 *	free acked data from the retransmit queue (updating the RTT
	 *	estimate per Jacobson), refill the wire from the write queue,
	 *	and drive the FIN-related state transitions.
	 *	Returns 0 for an unacceptable (future) ack, 1 otherwise.
	 */
	unsigned long ack;
	int flag = 0;

	/*
	 * 1 - there was data in packet as well as ack or new data is sent or
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if (sk->zapped)
		return(1);	/* Dead, cant ack any more so why bother */

	/*
	 *	Have we discovered a larger window
	 */
	ack = ntohl(th->ack_seq);

	if (ntohs(th->window) > sk->max_window)
	{
		sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(sk->max_window >> 1, sk->mtu);
#else
		sk->mss = min(sk->max_window, sk->mtu);
#endif
	}

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */
	if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */
	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
	{
		if (sk->debug)
			printk("Ack ignored %lu %lu\n", ack, sk->sent_seq);

		/*
		 *	Keepalive processing.  An ack for data we never
		 *	sent is unacceptable: return 0.
		 */

		if (after(ack, sk->sent_seq))
		{
			return(0);
		}

		/*
		 *	Restart the keepalive timer.
		 */

		if (sk->keepopen)
		{
			if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
		}
		return(1);
	}

	/*
	 *	If there is data set flag 1
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 *	See if our window has been shrunk.
	 */

	if (after(sk->window_seq, ack + ntohs(th->window)))
	{
		/*
		 *	We may need to move packets from the send queue
		 *	to the write queue, if the window has been shrunk on us.
		 *	The RFC says you are not allowed to shrink your window
		 *	like this, but if the other end does, you must be able
		 *	to deal with it.
		 */
		struct sk_buff *skb;
		struct sk_buff *skb2;
		struct sk_buff *wskb = NULL;

		skb2 = sk->send_head;
		sk->send_head = NULL;
		sk->send_tail = NULL;

		/*
		 *	This is an artifact of a flawed concept. We want one
		 *	queue and a smarter send routine when we send all.
		 */

		flag |= 4;	/* Window changed */

		sk->window_seq = ack + ntohs(th->window);
		cli();
		/* Walk the old retransmit list: packets beyond the new
		   window edge go back onto the write queue (in order),
		   the rest are relinked as the new retransmit list. */
		while (skb2 != NULL)
		{
			skb = skb2;
			skb2 = skb->link3;
			skb->link3 = NULL;
			if (after(skb->h.seq, sk->window_seq))
			{
				if (sk->packets_out > 0)
					sk->packets_out--;
				/* We may need to remove this from the dev send list. */
				if (skb->next != NULL)
				{
					skb_unlink(skb);
				}
				/* Now add it to the write_queue. */
				if (wskb == NULL)
					skb_queue_head(&sk->write_queue, skb);
				else
					skb_append(wskb, skb);
				wskb = skb;
			}
			else
			{
				if (sk->send_head == NULL)
				{
					sk->send_head = skb;
					sk->send_tail = skb;
				}
				else
				{
					sk->send_tail->link3 = skb;
					sk->send_tail = skb;
				}
				skb->link3 = NULL;
			}
		}
		sti();
	}

	/*
	 *	Pipe has emptied
	 */
	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out = 0;
	}

	/*
	 *	Update the right hand window edge of the host
	 */
	sk->window_seq = ack + ntohs(th->window);

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 *	This is Jacobson's slow start and congestion avoidance.
		 *	SIGCOMM '88, p. 328.  Because we keep cong_window in
		 *	integral mss's, we can't do cwnd += 1 / cwnd.  Instead,
		 *	maintain a counter and increment it once every cwnd times.
		 *	It's possible that this should be done only if
		 *	sk->retransmits == 0.  I'm interpreting "new data is acked"
		 *	as including data that has been retransmitted but is just
		 *	now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly.  In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

	/*
	 *	Remember the highest ack received.
	 */

	sk->rcv_ack_seq = ack;

	/*
	 *	If this ack opens up a zero window, clear backoff.  It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open ?
		 */

		if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
		    ! before(sk->window_seq, sk->write_queue.next->h.seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt.  this eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < 20)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = 20;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	while (sk->send_head != NULL)
	{
		/* Check for a bug. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as its confirmed to have arrived the other end.
		 */

		if (before(sk->send_head->h.seq, ack + 1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 *	We were retransmitting.  don't count this in RTT est
				 */
				flag |= 2;

				/*
				 * even though we've gotten an ack, we're still
				 * retransmitting as long as we're sending from
				 * the retransmit queue.  Keeping retransmits non-zero
				 * prevents us from getting new data interspersed with
				 * retransmissions.
				 */

				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 *	Note that we only reset backoff and rto in the
			 *	rtt recomputation code.  And that doesn't happen
			 *	if there were retransmissions in effect.  So the
			 *	first new packet after the retransmissions is
			 *	sent with the backoff still in effect.  Not until
			 *	we get an ack from a non-retransmitted packet do
			 *	we reset the backoff and rto.  This allows us to deal
			 *	with a situation where the network delay has increased
			 *	suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 *	We have one less packet out there.
			 */

			if (sk->packets_out > 0)
				sk->packets_out --;
			/*
			 *	Wake up the process, it can probably write more.
			 */
			if (!sk->dead)
				sk->write_space(sk);
			oskb = sk->send_head;

			if (!(flag&2)) 	/* Not retransmitting */
			{
				long m;

				/*
				 *	The following amusing code comes from Jacobson's
				 *	article in SIGCOMM '88.  Note that rtt and mdev
				 *	are scaled versions of rtt and mean deviation.
				 *	This is designed to be as fast as possible
				 *	m stands for "measurement".
				 */

				m = jiffies - oskb->when;  /* RTT */
				if (m <= 0)
					m = 1;		/* IS THIS RIGHT FOR <0 ??? */
				m -= (sk->rtt >> 3);    /* m is now error in rtt est */
				sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
				if (m < 0)
					m = -m;		/* m is now abs(error) */
				m -= (sk->mdev >> 2);   /* similar update on mdev */
				sk->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */

				/*
				 *	Now update timeout.  Note that this removes any backoff.
				 */

				sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
				if (sk->rto > 120*HZ)
					sk->rto = 120*HZ;
				if (sk->rto < 20)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
					sk->rto = 20;
				sk->backoff = 0;
			}
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
					   In this case as we just set it up */
			cli();
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 *	We may need to remove this from the dev send list.
			 */

			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 *	XXX someone ought to look at this too.. at the moment, if skb_peek()
	 *	returns non-NULL, we completely ignore the timer stuff in the else
	 *	clause.  We ought to organize the code so that else clause can
	 *	(should) be executed regardless, possibly moving the PROBE timer
	 *	reset over.  The skb_peek() thing should only move stuff to the
	 *	write queue, NOT also manage the timer functions.
	 */

	/*
	 *	Maybe we can take some stuff off of the write queue,
	 *	and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after(sk->window_seq + 1, sk->write_queue.next->h.seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room.
			 */
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 *	from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 *	from TCP_CLOSE we don't do anything
		 *
		 *	from anything else, if there is write data (or fin) pending,
		 *	we use a TIME_WRITE timeout, else if keepalive we reset to
		 *	a KEEPALIVE timeout, else we delete the timer.
		 *
		 *	We do not set flag for nominal write data, otherwise we may
		 *	force a state where we start to write itsy bitsy tidbits
		 *	of data.
		 */

		switch (sk->state) {
		case TCP_TIME_WAIT:
			/*
			 *	keep us in TIME_WAIT until we stop getting packets,
			 *	reset the timeout.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 *	don't touch the timer.
			 */
			break;
		default:
			/*
			 *	Must check send_head, write_queue, and ack_backlog
			 *	to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 *	We have nothing queued but space to send.  Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 *	In the LAST_ACK case, the other end FIN'd us.  We then FIN'd
	 *	them, and we are now waiting for an acknowledge to our FIN.
	 *	The other end is already in TIME_WAIT.
	 *
	 *	Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->debug)
			printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
				sk->rcv_ack_seq, sk->write_seq, sk->acked_seq, sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of our initiating
	 *	the close.
	 *
	 *	Move to FIN_WAIT2 to await a FIN from the other end.  Set
	 *	SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be
	 *	coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of a simultaneous
	 *	close.
	 *
	 *	Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake
	 */

	if (sk->state == TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk, th);
		sk->dummy_th.dest = th->source;
		sk->copied_seq = sk->acked_seq;
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->max_window == 0)
		{
			sk->max_window = 32;	/* Sanity check */
			sk->mss = min(sk->max_window, sk->mtu);
		}
	}

	/*
	 *	I make no guarantees about the first clause in the following
	 *	test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
	 *	what conditions "!flag" would be true.  However I think the rest
	 *	of the conditions would prevent that from causing any
	 *	unnecessary retransmission.
	 *	Clearly if the first packet has expired it should be
	 *	retransmitted.  The other alternative, "flag&2 && retransmits", is
	 *	harder to explain:  You have to look carefully at how and when the
	 *	timer is set and with what timeout.  The most recent transmission always
	 *	sets the timer.  So in general if the most recent thing has timed
	 *	out, everything before it has as well.  So we want to go ahead and
	 *	retransmit some more.  If we didn't explicitly test for this
	 *	condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 *	would not be true.  If you look at the pattern of timing, you can
	 *	show that rto is increased fast enough that the next packet would
	 *	almost never be retransmitted immediately.  Then you'd end up
	 *	waiting for a timeout to send each packet on the retransmission
	 *	queue.  With my implementation of the Karn sampling algorithm,
	 *	the timeout would double each time.  The net result is that it would
	 *	take a hideous amount of time to recover from a single dropped packet.
	 *	It's possible that there should also be a test for TIME_WRITE, but
	 *	I think as long as "send_head != NULL" and "retransmit" is on, we've
	 *	got to be in real retransmission mode.
	 *	Note that tcp_do_retransmit is called with all==1.  Setting cong_window
	 *	back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 *	As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	       (((flag&2) && sk->retransmits) ||
	       (sk->send_head->when + sk->rto < jiffies)))
	{
		if (sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk, 0);
		else
		{
			tcp_do_retransmit(sk, 1);
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return(1);
}
3690
/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 */

static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	/* Sequence number just past the FIN: data + one for SYN, one for FIN. */
	sk->fin_seq = th->seq + skb->len + th->syn + th->fin;

	/* Wake anyone sleeping on the socket and notify async waiters. */
	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * move to CLOSE_WAIT, tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			/* FIN carried on an RST: no more traffic either way. */
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * received a retransmission of the FIN,
			 * restart the TIME_WAIT timer.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens, we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */
			if(sk->ip_xmit_timeout != TIME_WRITE)
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * received a FIN -- send ACK and enter TIME_WAIT
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * already in CLOSE
			 */
			break;
		default:
			/* Any other state (e.g. LAST_ACK re-entry): fall back to
			   LAST_ACK and arm the MSL timer so we cannot hang forever. */
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}
3786
3787
3788 /*3789 * This routine handles the data. If there is room in the buffer,3790 * it will be have already been moved into it. If there is no3791 * room, then we will just have to discard the packet.3792 */3793
3794 extern__inline__inttcp_data(structsk_buff *skb, structsock *sk,
/* */3795 unsignedlongsaddr, unsignedshortlen)
3796 {3797 structsk_buff *skb1, *skb2;
3798 structtcphdr *th;
3799 intdup_dumped=0;
3800 unsignedlongnew_seq;
3801 unsignedlongshut_seq;
3802
3803 th = skb->h.th;
3804 skb->len = len -(th->doff*4);
3805
3806 /*3807 * The bytes in the receive read/assembly queue has increased. Needed for the3808 * low memory discard algorithm 3809 */3810
3811 sk->bytes_rcv += skb->len;
3812
3813 if (skb->len == 0 && !th->fin && !th->urg && !th->psh)
3814 {3815 /* 3816 * Don't want to keep passing ack's back and forth. 3817 * (someone sent us dataless, boring frame)3818 */3819 if (!th->ack)
3820 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3821 kfree_skb(skb, FREE_READ);
3822 return(0);
3823 }3824
3825 /*3826 * We no longer have anyone receiving data on this connection.3827 */3828
3829 #ifndef TCP_DONT_RST_SHUTDOWN
3830
3831 if(sk->shutdown & RCV_SHUTDOWN)
3832 {3833 /*3834 * FIXME: BSD has some magic to avoid sending resets to3835 * broken 4.2 BSD keepalives. Much to my surprise a few non3836 * BSD stacks still have broken keepalives so we want to3837 * cope with it.3838 */3839
3840 if(skb->len) /* We don't care if its just an ack or3841 a keepalive/window probe */3842 {3843 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */3844
3845 /* Do this the way 4.4BSD treats it. Not what I'd3846 regard as the meaning of the spec but its what BSD3847 does and clearly they know everything 8) */3848
3849 /*3850 * This is valid because of two things3851 *3852 * a) The way tcp_data behaves at the bottom.3853 * b) A fin takes effect when read not when received.3854 */3855
3856 shut_seq=sk->acked_seq+1; /* Last byte */3857
3858 if(after(new_seq,shut_seq))
3859 {3860 if(sk->debug)
3861 printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n",
3862 sk, new_seq, shut_seq, sk->blog);
3863 if(sk->dead)
3864 {3865 sk->acked_seq = new_seq + th->fin;
3866 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3867 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3868 tcp_statistics.TcpEstabResets++;
3869 tcp_set_state(sk,TCP_CLOSE);
3870 sk->err = EPIPE;
3871 sk->shutdown = SHUTDOWN_MASK;
3872 kfree_skb(skb, FREE_READ);
3873 return 0;
3874 }3875 }3876 }3877 }3878
3879 #endif3880
3881 /*3882 * Now we have to walk the chain, and figure out where this one3883 * goes into it. This is set up so that the last packet we received3884 * will be the first one we look at, that way if everything comes3885 * in order, there will be no performance loss, and if they come3886 * out of order we will be able to fit things in nicely.3887 *3888 * [AC: This is wrong. We should assume in order first and then walk3889 * forwards from the first hole based upon real traffic patterns.]3890 * 3891 */3892
3893 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */3894 {3895 skb_queue_head(&sk->receive_queue,skb);
3896 skb1= NULL;
3897 }3898 else3899 {3900 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
3901 {3902 if(sk->debug)
3903 {3904 printk("skb1=%p :", skb1);
3905 printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
3906 printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
3907 printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
3908 sk->acked_seq);
3909 }3910
3911 /*3912 * Optimisation: Duplicate frame or extension of previous frame from3913 * same sequence point (lost ack case).3914 * The frame contains duplicate data or replaces a previous frame3915 * discard the previous frame (safe as sk->inuse is set) and put3916 * the new one in its place.3917 */3918
3919 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3920 {3921 skb_append(skb1,skb);
3922 skb_unlink(skb1);
3923 kfree_skb(skb1,FREE_READ);
3924 dup_dumped=1;
3925 skb1=NULL;
3926 break;
3927 }3928
3929 /*3930 * Found where it fits3931 */3932
3933 if (after(th->seq+1, skb1->h.th->seq))
3934 {3935 skb_append(skb1,skb);
3936 break;
3937 }3938
3939 /*3940 * See if we've hit the start. If so insert.3941 */3942 if (skb1 == skb_peek(&sk->receive_queue))
3943 {3944 skb_queue_head(&sk->receive_queue, skb);
3945 break;
3946 }3947 }3948 }3949
3950 /*3951 * Figure out what the ack value for this frame is3952 */3953
3954 th->ack_seq = th->seq + skb->len;
3955 if (th->syn)
3956 th->ack_seq++;
3957 if (th->fin)
3958 th->ack_seq++;
3959
3960 if (before(sk->acked_seq, sk->copied_seq))
3961 {3962 printk("*** tcp.c:tcp_data bug acked < copied\n");
3963 sk->acked_seq = sk->copied_seq;
3964 }3965
3966 /*3967 * Now figure out if we can ack anything. This is very messy because we really want two3968 * receive queues, a completed and an assembly queue. We also want only one transmit3969 * queue.3970 */3971
3972 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
3973 {3974 if (before(th->seq, sk->acked_seq+1))
3975 {3976 intnewwindow;
3977
3978 if (after(th->ack_seq, sk->acked_seq))
3979 {3980 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3981 if (newwindow < 0)
3982 newwindow = 0;
3983 sk->window = newwindow;
3984 sk->acked_seq = th->ack_seq;
3985 }3986 skb->acked = 1;
3987
3988 /*3989 * When we ack the fin, we do the FIN 3990 * processing.3991 */3992
3993 if (skb->h.th->fin)
3994 {3995 tcp_fin(skb,sk,skb->h.th);
3996 }3997
3998 for(skb2 = skb->next;
3999 skb2 != (structsk_buff *)&sk->receive_queue;
4000 skb2 = skb2->next)
4001 {4002 if (before(skb2->h.th->seq, sk->acked_seq+1))
4003 {4004 if (after(skb2->h.th->ack_seq, sk->acked_seq))
4005 {4006 newwindow = sk->window -
4007 (skb2->h.th->ack_seq - sk->acked_seq);
4008 if (newwindow < 0)
4009 newwindow = 0;
4010 sk->window = newwindow;
4011 sk->acked_seq = skb2->h.th->ack_seq;
4012 }4013 skb2->acked = 1;
4014 /*4015 * When we ack the fin, we do4016 * the fin handling.4017 */4018 if (skb2->h.th->fin)
4019 {4020 tcp_fin(skb,sk,skb->h.th);
4021 }4022
4023 /*4024 * Force an immediate ack.4025 */4026
4027 sk->ack_backlog = sk->max_ack_backlog;
4028 }4029 else4030 {4031 break;
4032 }4033 }4034
4035 /*4036 * This also takes care of updating the window.4037 * This if statement needs to be simplified.4038 */4039 if (!sk->delay_acks ||
4040 sk->ack_backlog >= sk->max_ack_backlog ||
4041 sk->bytes_rcv > sk->max_unacked || th->fin) {4042 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4043 }4044 else4045 {4046 sk->ack_backlog++;
4047 if(sk->debug)
4048 printk("Ack queued.\n");
4049 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4050 }4051 }4052 }4053
4054 /*4055 * If we've missed a packet, send an ack.4056 * Also start a timer to send another.4057 */4058
4059 if (!skb->acked)
4060 {4061
4062 /*4063 * This is important. If we don't have much room left,4064 * we need to throw out a few packets so we have a good4065 * window. Note that mtu is used, not mss, because mss is really4066 * for the send side. He could be sending us stuff as large as mtu.4067 */4068
4069 while (sk->prot->rspace(sk) < sk->mtu)
4070 {4071 skb1 = skb_peek(&sk->receive_queue);
4072 if (skb1 == NULL)
4073 {4074 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4075 break;
4076 }4077
4078 /*4079 * Don't throw out something that has been acked. 4080 */4081
4082 if (skb1->acked)
4083 {4084 break;
4085 }4086
4087 skb_unlink(skb1);
4088 kfree_skb(skb1, FREE_READ);
4089 }4090 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4091 sk->ack_backlog++;
4092 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4093 }4094 else4095 {4096 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4097 }4098
4099 /*4100 * Now tell the user we may have some data. 4101 */4102
4103 if (!sk->dead)
4104 {4105 if(sk->debug)
4106 printk("Data wakeup.\n");
4107 sk->data_ready(sk,0);
4108 }4109 return(0);
4110 }4111
4112
/*
 *	This routine is only called when we have urgent data
 *	signalled. Its the 'slow' part of tcp_urg. It could be
 *	moved inline now as tcp_urg is only called from one
 *	place. We handle URGent data wrong. We have to - as
 *	BSD still doesn't use the correction from RFC961.
 */

static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
{
	/* Urgent pointer is an offset from th->seq; convert to an
	   absolute sequence number (BSD-style: points AT the urgent
	   byte once decremented, hence the ptr-- below). */
	unsigned long ptr = ntohs(th->urg_ptr);

	if (ptr)
		ptr--;
	ptr += th->seq;

	/* ignore urgent data that we've already seen and read */
	if (after(sk->copied_seq, ptr))
		return;

	/* do we already have a newer (or duplicate) urgent pointer? */
	if (sk->urg_data && !after(ptr, sk->urg_seq))
		return;

	/* tell the world about our new urgent pointer */
	/* sk->proc > 0 is a pid, < 0 is a (negated) process group. */
	if (sk->proc != 0) {
		if (sk->proc > 0) {
			kill_proc(sk->proc, SIGURG, 1);
		} else {
			kill_pg(-sk->proc, SIGURG, 1);
		}
	}
	/* Mark the urgent byte as expected but not yet received;
	   tcp_urg() fills in the value when the data arrives. */
	sk->urg_data = URG_NOTYET;
	sk->urg_seq = ptr;
}
/*
 *	This is the 'fast' part of urgent handling.
 */

extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
	unsigned long saddr, unsigned long len)
{
	unsigned long ptr;

	/*
	 *	Check if we get a new urgent pointer - normally not
	 */

	if (th->urg)
		tcp_check_urg(sk,th);

	/*
	 *	Do we wait for any urgent data? - normally not
	 */

	if (sk->urg_data != URG_NOTYET)
		return 0;

	/*
	 *	Is the urgent pointer pointing into this packet?
	 *	(offset of the urgent byte from the start of the TCP header)
	 */

	ptr = sk->urg_seq - th->seq + th->doff*4;
	if (ptr >= len)
		return 0;

	/*
	 *	Ok, got the correct packet, update info: store the urgent
	 *	byte itself in the low bits together with the URG_VALID flag.
	 */

	sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
	if (!sk->dead)
		sk->data_ready(sk,0);
	return 0;
}
/*
 *	This will accept the next outstanding connection.
 *
 *	Returns the newly-established child socket, or NULL with
 *	sk->err set (EINVAL / EAGAIN / ERESTARTSYS).
 */

static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 * We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */

	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race. */
	cli();
	sk->inuse = 1;

	/* Block (unless O_NONBLOCK) until a fully-established
	   connection is queued on the listen socket. */
	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		/* Woken by a signal rather than a new connection. */
		if (current->signal & ~current->blocked)
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk.
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4248
/*
 *	This will initiate an outgoing connection.
 *
 *	Validates the address, picks an initial sequence number,
 *	builds and transmits the SYN (with an MSS option), moves the
 *	socket to SYN_SENT and arms the retransmit timer.
 *
 *	Returns 0 on success or a negative errno.
 */

static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
	{
		return(-EISCONN);
	}

	/* 8 = offsetof(sin_family..sin_addr end); minimal sockaddr_in. */
	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */

	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 *	Don't want a TCP connection going to a broadcast address
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	release_sock(sk);

	/* GFP_KERNEL: may sleep, hence the release_sock above. */
	buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->len = 24;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;

	t1 = (struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	rt=ip_rt_route(sk->daddr, NULL, NULL);


	/*
	 *	We need to build the routing stuff from the things saved in skb.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
					IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 +tmp);

	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	t1->seq = ntohl(sk->write_seq++);
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	/* NOTE(review): window of 2 with no htons — historical quirk,
	   the real window is advertised once established. */
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;	/* 24 bytes: 20 header + 4 bytes of MSS option */
	/* use 512 or whatever user asked for */

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/* MSS selection: user setting, then route, then a guess based on
	   whether the destination is on the local subnet. */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
		sk->mtu = rt->rt_mss;
	else
	{
#ifdef CONFIG_INET_SNARL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - HEADER_SIZE;
		else
			sk->mtu = MAX_WINDOW;
	}
	/*
	 *	but not bigger than device MTU
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	Put in the TCP options to say MTU.
	 *	(option kind 2 = MSS, length 4, value in network byte order)
	 */

	ptr = (unsigned char *)(t1+1);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 *	This must go first otherwise a really quick response will get reset.
	 */

	tcp_set_state(sk,TCP_SYN_SENT);
	sk->rto = TCP_TIMEOUT_INIT;
#if 0 /* we already did this */
	init_timer(&sk->retransmit_timer);
#endif
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = TCP_SYN_RETRIES;

	sk->prot->queue_xmit(sk, dev, buff, 0);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4413
/* This functions checks to see if the tcp header is actually acceptable,
   i.e. whether at least part of the segment falls inside the receive
   window. Returns 1 when the segment should be processed, 0 when it
   should be dropped (possibly after an ack/reset sent from here). */
extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
	     struct options *opt, unsigned long saddr, struct device *dev)
{
	unsigned long next_seq;

	/* Payload length (and +1 for a FIN, which occupies sequence space). */
	next_seq = len - 4*th->doff;
	if (th->fin)
		next_seq++;
	/* if we have a zero window, we can't have any data in the packet.. */
	if (next_seq && !sk->window)
		goto ignore_it;
	next_seq += th->seq;

	/*
	 *	This isn't quite right. sk->acked_seq could be more recent
	 *	than sk->window. This is however close enough. We will accept
	 *	slightly more packets than we should, but it should not cause
	 *	problems unless someone is trying to forge packets.
	 */

	/* have we already seen all of this packet? */
	if (!after(next_seq+1, sk->acked_seq))
		goto ignore_it;
	/* or does it start beyond the window? */
	if (!before(th->seq, sk->acked_seq + sk->window + 1))
		goto ignore_it;

	/* ok, at least part of this packet would seem interesting.. */
	return 1;

ignore_it:
	if (th->rst)
		return 0;

	/*
	 *	Send a reset if we get something not ours and we are
	 *	unsynchronized. Note: We don't do anything to our end. We
	 *	are just killing the bogus remote connection then we will
	 *	connect again and it will work (with luck).
	 */

	if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
	{
		tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
		return 1;
	}

	/* Try to resync things. */
	tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
	return 0;
}
/*
 *	When we get a reset we do this: mark the socket as reset,
 *	translate the state into the right errno for the caller,
 *	move to CLOSE (unless RFC1337 protection keeps a TIME_WAIT
 *	socket alive), wake any sleeper, and drop the frame.
 *
 *	Always returns 0 and consumes both the skb and the socket lock.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Pick the errno the application should see for this state. */
	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	/* Let anyone sleeping on this socket know the state changed. */
	if (!sk->dead)
		sk->state_change(sk);

	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
/*
 *	A TCP packet has arrived.
 *
 *	Main input routine: checksum, socket lookup, backlog handling,
 *	then the RFC793/RFC1122 segment processing steps (LISTEN and
 *	SYN_SENT special cases, sequence check, RST, SYN, ACK, URG,
 *	data). Always returns 0; the skb is consumed on every path.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	unsigned long daddr, unsigned short len,
	unsigned long saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	if (!skb)
	{
		printk("IMPOSSIBLE 1\n");
		return(0);
	}

	if (!dev)
	{
		printk("IMPOSSIBLE 2\n");
		return(0);
	}

	tcp_statistics.TcpInSegs++;

	/* Not addressed to this host (e.g. promiscuous capture): drop. */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket.
	 */

	sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);

	/*
	 *	If this socket has got a reset its to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	/* redo!=0 means this frame came off the socket backlog and has
	   already been checksummed and byte-swapped on its first pass. */
	if (!redo)
	{
		if (tcp_check(th, len, saddr, daddr ))
		{
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}
		th->seq = ntohl(th->seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			/*
			 *	Discard frame
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->len = len;
		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here. */
		cli();
		if (sk->inuse)
		{
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 */

	if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

	skb->sk=sk;
	sk->rmem_alloc += skb->mem_len;

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but its not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/* Reset the ack - its an ack from a
					   different connection  [ th->rst is checked in tcp_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}
				/*
				 *	Ok.. its good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq=th->seq+1;
				sk->fin_seq=th->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if(sk->max_window==0)
				{
					sk->max_window = 32;	/* Sanity check */
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
	 *	a more complex suggestion for fixing these reuse issues in RFC1644
	 *	but not yet ready for general use. Also see RFC1379.
	 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(th->seq, sk->acked_seq) && !th->rst)
		{
			/* Kill the old TIME_WAIT socket and hand the SYN to a
			   listener on the same port, with a bumped ISN. */
			long seq=sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			sk->rmem_alloc -= skb->mem_len;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->mem_len;
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */

	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}

	/*
	 *	Process the ACK
	 */


	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	Process urgent data
	 */

	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}


	/*
	 *	Process the encapsulated data
	 */

	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */

	release_sock(sk);
	return 0;
}
/*
 *	This routine sends a packet with an out of date sequence
 *	number. It assumes the other end will try to ack it.
 *	(Used as a zero-window probe / keepalive-style prod.)
 */

static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev=NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 *	Write data can still be transmitted/retransmitted in the
	 *	following states. If any other state is encountered, return.
	 *	[listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED &&
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 &&
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	)
	{
		return;
	}

	/* GFP_ATOMIC: may be called from timer context, must not sleep. */
	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
		return;

	buff->len = sizeof(struct tcphdr);
	buff->free = 1;
	buff->sk = sk;
	buff->localroute = sk->localroute;

	t1 = (struct tcphdr *) buff->data;

	/* Put in the IP header and routing stuff. */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}

	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 +tmp);

	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

	/*
	 *	Use a previous sequence.
	 *	This should cause the other end to send an ack.
	 */

	t1->seq = htonl(sk->sent_seq-1);
	t1->ack = 1;
	t1->res1= 0;
	t1->res2= 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
	t1->syn = 0;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(tcp_select_window(sk));
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 *	Send it and free it.
	 *	This will prevent the timer from automatically being restarted.
	 */
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
4976 /*4977 * A window probe timeout has occurred.4978 */4979
4980 voidtcp_send_probe0(structsock *sk)
/* */4981 {4982 if (sk->zapped)
4983 return; /* After a valid reset we can send no more */4984
4985 tcp_write_wakeup(sk);
4986
4987 sk->backoff++;
4988 sk->rto = min(sk->rto << 1, 120*HZ);
4989 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
4990 sk->retransmits++;
4991 sk->prot->retransmits ++;
4992 }4993
4994 /*4995 * Socket option code for TCP. 4996 */4997
4998 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */4999 {5000 intval,err;
5001
5002 if(level!=SOL_TCP)
5003 returnip_setsockopt(sk,level,optname,optval,optlen);
5004
5005 if (optval == NULL)
5006 return(-EINVAL);
5007
5008 err=verify_area(VERIFY_READ, optval, sizeof(int));
5009 if(err)
5010 returnerr;
5011
5012 val = get_fs_long((unsignedlong *)optval);
5013
5014 switch(optname)
5015 {5016 caseTCP_MAXSEG:
5017 /*5018 * values greater than interface MTU won't take effect. however at5019 * the point when this call is done we typically don't yet know5020 * which interface is going to be used5021 */5022 if(val<1||val>MAX_WINDOW)
5023 return -EINVAL;
5024 sk->user_mss=val;
5025 return 0;
5026 caseTCP_NODELAY:
5027 sk->nonagle=(val==0)?0:1;
5028 return 0;
5029 default:
5030 return(-ENOPROTOOPT);
5031 }5032 }5033
5034 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5035 {5036 intval,err;
5037
5038 if(level!=SOL_TCP)
5039 returnip_getsockopt(sk,level,optname,optval,optlen);
5040
5041 switch(optname)
5042 {5043 caseTCP_MAXSEG:
5044 val=sk->user_mss;
5045 break;
5046 caseTCP_NODELAY:
5047 val=sk->nonagle;
5048 break;
5049 default:
5050 return(-ENOPROTOOPT);
5051 }5052 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5053 if(err)
5054 returnerr;
5055 put_fs_long(sizeof(int),(unsignedlong *) optlen);
5056
5057 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5058 if(err)
5059 returnerr;
5060 put_fs_long(val,(unsignedlong *)optval);
5061
5062 return(0);
5063 }5064
5065
5066 structprototcp_prot = {5067 sock_wmalloc,
5068 sock_rmalloc,
5069 sock_wfree,
5070 sock_rfree,
5071 sock_rspace,
5072 sock_wspace,
5073 tcp_close,
5074 tcp_read,
5075 tcp_write,
5076 tcp_sendto,
5077 tcp_recvfrom,
5078 ip_build_header,
5079 tcp_connect,
5080 tcp_accept,
5081 ip_queue_xmit,
5082 tcp_retransmit,
5083 tcp_write_wakeup,
5084 tcp_read_wakeup,
5085 tcp_rcv,
5086 tcp_select,
5087 tcp_ioctl,
5088 NULL,
5089 tcp_shutdown,
5090 tcp_setsockopt,
5091 tcp_getsockopt,
5092 128,
5093 0,
5094 {NULL,},
5095 "TCP",
5096 0, 0
5097 };