1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@no.unit.nvg> 20 * 21 * Fixes: 22 * Alan Cox : Numerous verify_area() calls 23 * Alan Cox : Set the ACK bit on a reset 24 * Alan Cox : Stopped it crashing if it closed while sk->inuse=1 25 * and was trying to connect (tcp_err()). 26 * Alan Cox : All icmp error handling was broken 27 * pointers passed where wrong and the 28 * socket was looked up backwards. Nobody 29 * tested any icmp error code obviously. 30 * Alan Cox : tcp_err() now handled properly. It wakes people 31 * on errors. select behaves and the icmp error race 32 * has gone by moving it into sock.c 33 * Alan Cox : tcp_reset() fixed to work for everything not just 34 * packets for unknown sockets. 35 * Alan Cox : tcp option processing. 36 * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong] 37 * Herp Rosmanith : More reset fixes 38 * Alan Cox : No longer acks invalid rst frames. Acking 39 * any kind of RST is right out. 40 * Alan Cox : Sets an ignore me flag on an rst receive 41 * otherwise odd bits of prattle escape still 42 * Alan Cox : Fixed another acking RST frame bug. Should stop 43 * LAN workplace lockups. 
44 * Alan Cox : Some tidyups using the new skb list facilities 45 * Alan Cox : sk->keepopen now seems to work 46 * Alan Cox : Pulls options out correctly on accepts 47 * Alan Cox : Fixed assorted sk->rqueue->next errors 48 * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops. 49 * Alan Cox : Tidied tcp_data to avoid a potential nasty. 50 * Alan Cox : Added some better commenting, as the tcp is hard to follow 51 * Alan Cox : Removed incorrect check for 20 * psh 52 * Michael O'Reilly : ack < copied bug fix. 53 * Johannes Stille : Misc tcp fixes (not all in yet). 54 * Alan Cox : FIN with no memory -> CRASH 55 * Alan Cox : Added socket option proto entries. Also added awareness of them to accept. 56 * Alan Cox : Added TCP options (SOL_TCP) 57 * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets. 58 * Alan Cox : Use ip_tos/ip_ttl settings. 59 * Alan Cox : Handle FIN (more) properly (we hope). 60 * Alan Cox : RST frames sent on unsynchronised state ack error/ 61 * Alan Cox : Put in missing check for SYN bit. 62 * Alan Cox : Added tcp_select_window() aka NET2E 63 * window non shrink trick. 64 * Alan Cox : Added a couple of small NET2E timer fixes 65 * Charles Hedrick : TCP fixes 66 * Toomas Tamm : TCP window fixes 67 * Alan Cox : Small URG fix to rlogin ^C ack fight 68 * Charles Hedrick : Rewrote most of it to actually work 69 * Linus : Rewrote tcp_read() and URG handling 70 * completely 71 * Gerhard Koerting: Fixed some missing timer handling 72 * Matthew Dillon : Reworked TCP machine states as per RFC 73 * Gerhard Koerting: PC/TCP workarounds 74 * Adam Caldwell : Assorted timer/timing errors 75 * Matthew Dillon : Fixed another RST bug 76 * Alan Cox : Move to kernel side addressing changes. 77 * Alan Cox : Beginning work on TCP fastpathing (not yet usable) 78 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
79 * Alan Cox : TCP fast path debugging 80 * Alan Cox : Window clamping 81 * Michael Riepe : Bug in tcp_check() 82 * Matt Dillon : More TCP improvements and RST bug fixes 83 * Matt Dillon : Yet more small nasties remove from the TCP code 84 * (Be very nice to this man if tcp finally works 100%) 8) 85 * Alan Cox : BSD accept semantics. 86 * Alan Cox : Reset on closedown bug. 87 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 88 * Michael Pall : Handle select() after URG properly in all cases. 89 * Michael Pall : Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin). 90 * Michael Pall : Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now. 91 * Michael Pall : recv(...,MSG_OOB) never blocks in the BSD api. 92 * Alan Cox : Changed the semantics of sk->socket to 93 * fix a race and a signal problem with 94 * accept() and async I/O. 95 * Alan Cox : Relaxed the rules on tcp_sendto(). 96 * Yury Shevchuk : Really fixed accept() blocking problem. 97 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 98 * clients/servers which listen in on 99 * fixed ports. 100 * Alan Cox : Cleaned the above up and shrank it to 101 * a sensible code size. 102 * Alan Cox : Self connect lockup fix. 103 * Alan Cox : No connect to multicast. 104 * Ross Biro : Close unaccepted children on master 105 * socket close. 106 * Alan Cox : Reset tracing code. 107 * Alan Cox : Spurious resets on shutdown. 108 * Alan Cox : Giant 15 minute/60 second timer error 109 * Alan Cox : Small whoops in selecting before an accept. 110 * Alan Cox : Kept the state trace facility since it's 111 * handy for debugging. 112 * Alan Cox : More reset handler fixes. 113 * Alan Cox : Started rewriting the code based on the RFC's 114 * for other useful protocol references see: 115 * Comer, KA9Q NOS, and for a reference on the 116 * difference between specifications and how BSD 117 * works see the 4.4lite source. 
118 * A.N.Kuznetsov : Don't time wait on completion of tidy 119 * close. 120 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 121 * Linus Torvalds : Fixed BSD port reuse to work first syn 122 * Alan Cox : Reimplemented timers as per the RFC and using multiple 123 * timers for sanity. 124 * Alan Cox : Small bug fixes, and a lot of new 125 * comments. 126 * Alan Cox : Fixed dual reader crash by locking 127 * the buffers (much like datagram.c) 128 * Alan Cox : Fixed stuck sockets in probe. A probe 129 * now gets fed up of retrying without 130 * (even a no space) answer. 131 * Alan Cox : Extracted closing code better 132 * Alan Cox : Fixed the closing state machine to 133 * resemble the RFC. 134 * Alan Cox : More 'per spec' fixes. 135 * Alan Cox : tcp_data() doesn't ack illegal PSH 136 * only frames. At least one pc tcp stack 137 * generates them. 138 * 139 * 140 * To Fix: 141 * Fast path the code. Two things here - fix the window calculation 142 * so it doesn't iterate over the queue, also spot packets with no funny 143 * options arriving in order and process directly. 144 * 145 * Implement RFC 1191 [Path MTU discovery] 146 * Look at the effect of implementing RFC 1337 suggestions and their impact. 147 * Rewrite output state machine to use a single queue and do low window 148 * situations as per the spec (RFC 1122) 149 * Speed up input assembly algorithm. 150 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 151 * could do with it working on IPv4 152 * User settable/learned rtt/max window/mtu 153 * Cope with MTU/device switches when retransmitting in tcp. 154 * Fix the window handling to use PR's new code. 155 * 156 * Change the fundamental structure to a single send queue maintained 157 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 158 * active routes too]). Cut the queue off in tcp_retransmit/ 159 * tcp_transmit. 160 * Change the receive queue to assemble as it goes. 
This lets us 161 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 162 * tcp_data/tcp_read as well as the window shrink crud. 163 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 164 * tcp_queue_skb seem obvious routines to extract. 165 * 166 * This program is free software; you can redistribute it and/or 167 * modify it under the terms of the GNU General Public License 168 * as published by the Free Software Foundation; either version 169 * 2 of the License, or(at your option) any later version. 170 * 171 * Description of States: 172 * 173 * TCP_SYN_SENT sent a connection request, waiting for ack 174 * 175 * TCP_SYN_RECV received a connection request, sent ack, 176 * waiting for final ack in three-way handshake. 177 * 178 * TCP_ESTABLISHED connection established 179 * 180 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 181 * transmission of remaining buffered data 182 * 183 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 184 * to shutdown 185 * 186 * TCP_CLOSING both sides have shutdown but we still have 187 * data we have to finish sending 188 * 189 * TCP_TIME_WAIT timeout to catch resent junk before entering 190 * closed, can only be entered from FIN_WAIT2 191 * or CLOSING. Required because the other end 192 * may not have gotten our last ACK causing it 193 * to retransmit the data packet (which we ignore) 194 * 195 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 196 * us to finish writing our data and to shutdown 197 * (we have to close() to move on to LAST_ACK) 198 * 199 * TCP_LAST_ACK out side has shutdown after remote has 200 * shutdown. There may still be data in our 201 * buffer that we have to finish sending 202 * 203 * TCP_CLOSE socket is finished 204 */ 205
206 #include <linux/types.h>
207 #include <linux/sched.h>
208 #include <linux/mm.h>
209 #include <linux/time.h>
210 #include <linux/string.h>
211 #include <linux/config.h>
212 #include <linux/socket.h>
213 #include <linux/sockios.h>
214 #include <linux/termios.h>
215 #include <linux/in.h>
216 #include <linux/fcntl.h>
217 #include <linux/inet.h>
218 #include <linux/netdevice.h>
219 #include "snmp.h"
220 #include "ip.h"
221 #include "protocol.h"
222 #include "icmp.h"
223 #include "tcp.h"
224 #include "arp.h"
225 #include <linux/skbuff.h>
226 #include "sock.h"
227 #include "route.h"
228 #include <linux/errno.h>
229 #include <linux/timer.h>
230 #include <asm/system.h>
231 #include <asm/segment.h>
232 #include <linux/mm.h>
233
234 /* 235 * The MSL timer is the 'normal' timer. 236 */ 237
238 #definereset_msl_timer(x,y,z) reset_timer(x,y,z)
239
240 #define SEQ_TICK 3
241 unsignedlongseq_offset;
242 structtcp_mibtcp_statistics;
243
244 staticvoidtcp_close(structsock *sk, inttimeout);
245
246
247 /* 248 * The less said about this the better, but it works and will do for 1.2 249 */ 250
251 staticstructwait_queue *master_select_wakeup;
252
/*
 *	Return the smaller of two unsigned quantities.
 */
static __inline__ int min(unsigned int a, unsigned int b)
{
	return (a < b) ? a : b;
}
/* Define STATE_TRACE to get printk() traces of TCP state transitions. */
#undef STATE_TRACE

#ifdef STATE_TRACE
/* Human-readable names indexed by the TCP_* state constants. */
static char *statename[]={
	"Unused","Established","Syn Sent","Syn Recv",
	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
	"Close Wait","Last ACK","Listen","Closing"
};
#endif
270 static__inline__voidtcp_set_state(structsock *sk, intstate)
/* */ 271 { 272 if(sk->state==TCP_ESTABLISHED)
273 tcp_statistics.TcpCurrEstab--;
274 #ifdefSTATE_TRACE 275 if(sk->debug)
276 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
277 #endif 278 /* This is a hack but it doesn't occur often and it's going to 279 be a real to fix nicely */ 280
281 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
282 { 283 wake_up_interruptible(&master_select_wakeup);
284 } 285 sk->state=state;
286 if(state==TCP_ESTABLISHED)
287 tcp_statistics.TcpCurrEstab++;
288 } 289
290 /* 291 * This routine picks a TCP windows for a socket based on 292 * the following constraints 293 * 294 * 1. The window can never be shrunk once it is offered (RFC 793) 295 * 2. We limit memory per socket 296 * 297 * For now we use NET2E3's heuristic of offering half the memory 298 * we have handy. All is not as bad as this seems however because 299 * of two things. Firstly we will bin packets even within the window 300 * in order to get the data we are waiting for into the memory limit. 301 * Secondly we bin common duplicate forms at receive time 302 * Better heuristics welcome 303 */ 304
305 inttcp_select_window(structsock *sk)
/* */ 306 { 307 intnew_window = sk->prot->rspace(sk);
308
309 if(sk->window_clamp)
310 new_window=min(sk->window_clamp,new_window);
311 /* 312 * Two things are going on here. First, we don't ever offer a 313 * window less than min(sk->mss, MAX_WINDOW/2). This is the 314 * receiver side of SWS as specified in RFC1122. 315 * Second, we always give them at least the window they 316 * had before, in order to avoid retracting window. This 317 * is technically allowed, but RFC1122 advises against it and 318 * in practice it causes trouble. 319 * 320 * Fixme: This doesn't correctly handle the case where 321 * new_window > sk->window but not by enough to allow for the 322 * shift in sequence space. 323 */ 324 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
325 return(sk->window);
326 return(new_window);
327 } 328
329 /* 330 * Find someone to 'accept'. Must be called with 331 * sk->inuse=1 or cli() 332 */ 333
334 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 335 { 336 structsk_buff *p=skb_peek(&s->receive_queue);
337 if(p==NULL)
338 returnNULL;
339 do 340 { 341 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
342 returnp;
343 p=p->next;
344 } 345 while(p!=(structsk_buff *)&s->receive_queue);
346 returnNULL;
347 } 348
349 /* 350 * Remove a completed connection and return it. This is used by 351 * tcp_accept() to get connections from the queue. 352 */ 353
354 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 355 { 356 structsk_buff *skb;
357 unsignedlongflags;
358 save_flags(flags);
359 cli();
360 skb=tcp_find_established(s);
361 if(skb!=NULL)
362 skb_unlink(skb); /* Take it off the queue */ 363 restore_flags(flags);
364 returnskb;
365 } 366
367 /* 368 * This routine closes sockets which have been at least partially 369 * opened, but not yet accepted. Currently it is only called by 370 * tcp_close, and timeout mirrors the value there. 371 */ 372
373 staticvoidtcp_close_pending (structsock *sk)
/* */ 374 { 375 structsk_buff *skb;
376
377 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
378 { 379 skb->sk->dead=1;
380 tcp_close(skb->sk, 0);
381 kfree_skb(skb, FREE_READ);
382 } 383 return;
384 } 385
386 /* 387 * Enter the time wait state. 388 */ 389
390 staticvoidtcp_time_wait(structsock *sk)
/* */ 391 { 392 tcp_set_state(sk,TCP_TIME_WAIT);
393 sk->shutdown = SHUTDOWN_MASK;
394 if (!sk->dead)
395 sk->state_change(sk);
396 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
397 } 398
/*
 *	A socket has timed out on its send queue and wants to do a
 *	little retransmitting. Currently this means TCP.
 *
 *	Walks sk->send_head (linked via skb->link3), refreshing each
 *	frame's ack/window fields and handing it back to the device.
 *	If 'all' is zero only the head frame is resent; otherwise we
 *	resend up to sk->cong_window frames.
 */

void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;
	int ct=0;		/* Frames processed so far this call */

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;	/* Restart the round-trip clock for this frame */

		/*
		 * In general it's OK just to use the old packet.  However we
		 * need to use the current ack and window fields.  Urg and
		 * urg_ptr could possibly stand to be updated as well, but we
		 * don't keep the necessary data.  That shouldn't be a problem,
		 * if the other end is doing the right thing.  Since we're
		 * changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)(skb->data + dev->hard_header_len);
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = skb->len - (((unsigned char *) th) - skb->data);

		/*
		 * Note: We ought to check for window limits here but
		 * currently this is done (less efficiently) elsewhere.
		 * We do need to check for a route change but can't handle
		 * that until we have the new 1.3.x buffers in.
		 */

		/* New IP id, then refresh the IP header checksum to match. */
		iph->id = htons(ip_id_count++);
		ip_send_check(iph);

		/*
		 * This is not the right way to handle this. We have to
		 * issue an up to date window and ack report with this
		 * retransmit to keep the odd buggy tcp that relies on
		 * the fact BSD does this happy.
		 * We don't however need to recalculate the entire
		 * checksum, so someone wanting a small problem to play
		 * with might like to implement RFC1141/RFC1624 and speed
		 * this up by avoiding a full checksum.
		 */

		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));
		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		/*
		 * If the interface is (still) up and running, kick it.
		 */

		if (dev->flags & IFF_UP)
		{
			/*
			 * If the packet is still being sent by the device/protocol
			 * below then don't retransmit. This is both needed, and good -
			 * especially with connected mode AX.25 where it stops resends
			 * occurring of an as yet unsent anyway frame!
			 * We still add up the counts as the round trip time wants
			 * adjusting.
			 */
			if (sk && !skb_device_locked(skb))
			{
				/* Remove it from any existing driver queue first! */
				skb_unlink(skb);
				/* Now queue it */
				ip_statistics.IpOutRequests++;
				dev_queue_xmit(skb, dev, sk->priority);
			}
		}

		/*
		 * Count retransmissions
		 */

		ct++;
		sk->prot->retransmits ++;

		/*
		 * Only one retransmit requested.
		 */

		if (!all)
			break;

		/*
		 * This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;	/* Next frame on the retransmit queue */
	}
}
511 /* 512 * Reset the retransmission timer 513 */ 514
515 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 516 { 517 del_timer(&sk->retransmit_timer);
518 sk->ip_xmit_timeout = why;
519 if((int)when < 0)
520 { 521 when=3;
522 printk("Error: Negative timer in xmit_timer\n");
523 } 524 sk->retransmit_timer.expires=when;
525 add_timer(&sk->retransmit_timer);
526 } 527
528 /* 529 * This is the normal code called for timeouts. It does the retransmission 530 * and then does backoff. tcp_do_retransmit is separated out because 531 * tcp_ack needs to send stuff from the retransmit queue without 532 * initiating a backoff. 533 */ 534
535
536 voidtcp_retransmit_time(structsock *sk, intall)
/* */ 537 { 538 tcp_do_retransmit(sk, all);
539
540 /* 541 * Increase the timeout each time we retransmit. Note that 542 * we do not increase the rtt estimate. rto is initialized 543 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 544 * that doubling rto each time is the least we can get away with. 545 * In KA9Q, Karn uses this for the first few times, and then 546 * goes to quadratic. netBSD doubles, but only goes up to *64, 547 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 548 * defined in the protocol as the maximum possible RTT. I guess 549 * we'll have to use something other than TCP to talk to the 550 * University of Mars. 551 * 552 * PAWS allows us longer timeouts and large windows, so once 553 * implemented ftp to mars will work nicely. We will have to fix 554 * the 120 second clamps though! 555 */ 556
557 sk->retransmits++;
558 sk->backoff++;
559 sk->rto = min(sk->rto << 1, 120*HZ);
560 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
561 } 562
563
564 /* 565 * A timer event has trigger a tcp retransmit timeout. The 566 * socket xmit queue is ready and set up to send. Because 567 * the ack receive code keeps the queue straight we do 568 * nothing clever here. 569 */ 570
571 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 572 { 573 if (all)
574 { 575 tcp_retransmit_time(sk, all);
576 return;
577 } 578
579 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 580 /* sk->ssthresh in theory can be zero. I guess that's OK */ 581 sk->cong_count = 0;
582
583 sk->cong_window = 1;
584
585 /* Do the actual retransmit. */ 586 tcp_retransmit_time(sk, all);
587 } 588
589 /* 590 * A write timeout has occurred. Process the after effects. 591 */ 592
593 staticinttcp_write_timeout(structsock *sk)
/* */ 594 { 595 /* 596 * Look for a 'soft' timeout. 597 */ 598 if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
599 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
600 { 601 /* 602 * Attempt to recover if arp has changed (unlikely!) or 603 * a route has shifted (not supported prior to 1.3). 604 */ 605 arp_destroy (sk->daddr, 0);
606 ip_route_check (sk->daddr);
607 } 608 /* 609 * Has it gone just too far ? 610 */ 611 if (sk->retransmits > TCP_RETR2)
612 { 613 sk->err = ETIMEDOUT;
614 sk->error_report(sk);
615 del_timer(&sk->retransmit_timer);
616 /* 617 * Time wait the socket 618 */ 619 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
620 { 621 tcp_set_state(sk,TCP_TIME_WAIT);
622 reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
623 } 624 else 625 { 626 /* 627 * Clean up time. 628 */ 629 tcp_set_state(sk, TCP_CLOSE);
630 return 0;
631 } 632 } 633 return 1;
634 } 635
/*
 *	The TCP retransmit timer.  This lacks a few small details.
 *
 *	1. 	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 *
 *	'data' is the socket the timer was armed for, cast to a long.
 */

static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;	/* Reason the timer was armed (TIME_*) */

	/*
	 *	only process if socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	/* Claim the socket, then interrupts may be re-enabled. */
	sk->inuse = 1;
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			/* send_head is shared with interrupt code: sample it safely. */
			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				/* Nothing pending: the wakeup above was all we needed. */
				restore_flags(flags);
			}
			else
			{
				/*
				 * Kicked by a delayed ack. Reset timer
				 * correctly now
				 */
				if (jiffies < skb->when + sk->rto)
				{
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission
				 */
				sk->prot->retransmit (sk, 0);
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
744 /* 745 * This routine is called by the ICMP module when it gets some 746 * sort of error condition. If err < 0 then the socket should 747 * be closed and the error returned to the user. If err > 0 748 * it's just the icmp type << 8 | icmp code. After adjustment 749 * header points to the first 8 bytes of the tcp header. We need 750 * to find the appropriate port. 751 */ 752
753 voidtcp_err(interr, unsignedchar *header, unsignedlongdaddr,
/* */ 754 unsignedlongsaddr, structinet_protocol *protocol)
755 { 756 structtcphdr *th;
757 structsock *sk;
758 structiphdr *iph=(structiphdr *)header;
759
760 header+=4*iph->ihl;
761
762
763 th =(structtcphdr *)header;
764 sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
765
766 if (sk == NULL)
767 return;
768
769 if(err<0)
770 { 771 sk->err = -err;
772 sk->error_report(sk);
773 return;
774 } 775
776 if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8))
777 { 778 /* 779 * FIXME: 780 * For now we will just trigger a linear backoff. 781 * The slow start code should cause a real backoff here. 782 */ 783 if (sk->cong_window > 4)
784 sk->cong_window--;
785 return;
786 } 787
788 /* sk->err = icmp_err_convert[err & 0xff].errno; -- moved as TCP should hide non fatals internally (and does) */ 789
790 /* 791 * If we've already connected we will keep trying 792 * until we time out, or the user gives up. 793 */ 794
795 if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT)
796 { 797 if (sk->state == TCP_SYN_SENT)
798 { 799 tcp_statistics.TcpAttemptFails++;
800 tcp_set_state(sk,TCP_CLOSE);
801 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ 802 } 803 sk->err = icmp_err_convert[err & 0xff].errno;
804 } 805 return;
806 } 807
808
/*
 *	Walk down the receive queue counting readable data until we hit the end or we find a gap
 *	in the received data queue (ie a frame missing that needs sending to us). Not
 *	sorting using two queues as data arrives makes life so much harder.
 *
 *	Returns the number of in-order bytes available for reading,
 *	stopping at the first PSH once anything has been counted.
 */

static int tcp_readable(struct sock *sk)
{
	unsigned long counted;	/* Sequence number we have counted up to */
	unsigned long amount;	/* Readable bytes found so far */
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	/* The queue is also appended to at interrupt time: lock it out. */
	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 *	Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->h.th->seq))	/* Found a hole so stops here */
			break;
		sum = skb->len -(counted - skb->h.th->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;		/* SYN occupies a sequence number of its own */
		if (sum > 0)
		{	/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;	/* ...but the SYN carries no readable byte */
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop. This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
886 /* 887 * LISTEN is a special case for select.. 888 */ 889 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */ 890 { 891 if (sel_type == SEL_IN) { 892 intretval;
893
894 sk->inuse = 1;
895 retval = (tcp_find_established(sk) != NULL);
896 release_sock(sk);
897 if (!retval)
898 select_wait(&master_select_wakeup,wait);
899 returnretval;
900 } 901 return 0;
902 } 903
904
905 /* 906 * Wait for a TCP event. 907 * 908 * Note that we don't need to set "sk->inuse", as the upper select layers 909 * take care of normal races (between the test and the event) and we don't 910 * go look at any of the socket buffers directly. 911 */ 912 staticinttcp_select(structsock *sk, intsel_type, select_table *wait)
/* */ 913 { 914 if (sk->state == TCP_LISTEN)
915 returntcp_listen_select(sk, sel_type, wait);
916
917 switch(sel_type) { 918 caseSEL_IN:
919 if (sk->err)
920 return 1;
921 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
922 break;
923
924 if (sk->shutdown & RCV_SHUTDOWN)
925 return 1;
926
927 if (sk->acked_seq == sk->copied_seq)
928 break;
929
930 if (sk->urg_seq != sk->copied_seq ||
931 sk->acked_seq != sk->copied_seq+1 ||
932 sk->urginline || !sk->urg_data)
933 return 1;
934 break;
935
936 caseSEL_OUT:
937 if (sk->shutdown & SEND_SHUTDOWN)
938 return 0;
939 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
940 break;
941 /* 942 * This is now right thanks to a small fix 943 * by Matt Dillon. 944 */ 945
946 if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
947 break;
948 return 1;
949
950 caseSEL_EX:
951 if (sk->err || sk->urg_data)
952 return 1;
953 break;
954 } 955 select_wait(sk->sleep, wait);
956 return 0;
957 } 958
959 inttcp_ioctl(structsock *sk, intcmd, unsignedlongarg)
/* */ 960 { 961 interr;
962 switch(cmd)
963 { 964
965 caseTIOCINQ:
966 #ifdef FIXME /* FIXME: */ 967 caseFIONREAD:
968 #endif 969 { 970 unsignedlongamount;
971
972 if (sk->state == TCP_LISTEN)
973 return(-EINVAL);
974
975 sk->inuse = 1;
976 amount = tcp_readable(sk);
977 release_sock(sk);
978 err=verify_area(VERIFY_WRITE,(void *)arg,
979 sizeof(unsignedlong));
980 if(err)
981 returnerr;
982 put_fs_long(amount,(unsignedlong *)arg);
983 return(0);
984 } 985 caseSIOCATMARK:
986 { 987 intansw = sk->urg_data && sk->urg_seq == sk->copied_seq;
988
989 err = verify_area(VERIFY_WRITE,(void *) arg,
990 sizeof(unsignedlong));
991 if (err)
992 returnerr;
993 put_fs_long(answ,(int *) arg);
994 return(0);
995 } 996 caseTIOCOUTQ:
997 { 998 unsignedlongamount;
999
1000 if (sk->state == TCP_LISTEN) return(-EINVAL);
1001 amount = sk->prot->wspace(sk);
1002 err=verify_area(VERIFY_WRITE,(void *)arg,
1003 sizeof(unsignedlong));
1004 if(err)
1005 returnerr;
1006 put_fs_long(amount,(unsignedlong *)arg);
1007 return(0);
1008 }1009 default:
1010 return(-EINVAL);
1011 }1012 }1013
1014
/*
 *	This routine computes a TCP checksum (x86 assembler version).
 *
 *	Returns the ones-complement checksum over the pseudo-header
 *	(saddr, daddr, protocol, length) followed by 'len' bytes of
 *	TCP header and data starting at 'th'.
 */

unsigned short tcp_check(struct tcphdr *th, int len,
	unsigned long saddr, unsigned long daddr)
{
	unsigned long sum;

	if (saddr == 0) saddr = ip_my_addr();

	/*
	 * stupid, gcc complains when I use just one __asm__ block,
	 * something about too many reloads, but this is just two
	 * instructions longer than what I want
	 */

	/* First block: fold saddr, daddr and the length/protocol word
	   of the pseudo-header into the accumulator, with end-around
	   carry. */
	__asm__("
	addl %%ecx, %%ebx
	adcl %%edx, %%ebx
	adcl $0, %%ebx
	"
	: "=b"(sum)
	: "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
	: "bx", "cx", "dx" );

	/* Second block: sum the TCP segment itself - 32 bytes at a time
	   (loop unrolled 8x at label 1), then remaining 4-byte words
	   (label 3), a trailing 16-bit word and final odd byte (labels
	   5/6), and finally fold the 32-bit accumulator down to 16 bits. */
	__asm__("
	movl %%ecx, %%edx
	cld
	cmpl $32, %%ecx
	jb 2f
	shrl $5, %%ecx
	clc
1:	lodsl
	adcl %%eax, %%ebx
	lodsl
	adcl %%eax, %%ebx
	lodsl
	adcl %%eax, %%ebx
	lodsl
	adcl %%eax, %%ebx
	lodsl
	adcl %%eax, %%ebx
	lodsl
	adcl %%eax, %%ebx
	lodsl
	adcl %%eax, %%ebx
	lodsl
	adcl %%eax, %%ebx
	loop 1b
	adcl $0, %%ebx
	movl %%edx, %%ecx
2:	andl $28, %%ecx
	je 4f
	shrl $2, %%ecx
	clc
3:	lodsl
	adcl %%eax, %%ebx
	loop 3b
	adcl $0, %%ebx
4:	movl $0, %%eax
	testw $2, %%dx
	je 5f
	lodsw
	addl %%eax, %%ebx
	adcl $0, %%ebx
	movw $0, %%ax
5:	test $1, %%edx
	je 6f
	lodsb
	addl %%eax, %%ebx
	adcl $0, %%ebx
6:	movl %%ebx, %%eax
	shrl $16, %%eax
	addw %%ax, %%bx
	adcw $0, %%bx
	"
	: "=b"(sum)
	: "0"(sum), "c"(len), "S"(th)
	: "ax", "bx", "cx", "dx", "si" );

	/* We only want the bottom 16 bits, but we never cleared the top 16. */

	return((~sum) & 0xffff);
}
1099
1100
1101 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1102 unsignedlongdaddr, intlen, structsock *sk)
1103 {1104 th->check = 0;
1105 th->check = tcp_check(th, len, saddr, daddr);
1106 return;
1107 }1108
1109 /*1110 * This is the main buffer sending routine. We queue the buffer1111 * having checked it is sane seeming.1112 */1113
1114 staticvoidtcp_send_skb(structsock *sk, structsk_buff *skb)
/* */1115 {1116 intsize;
1117 structtcphdr * th = skb->h.th;
1118
1119 /*1120 * length of packet (not counting length of pre-tcp headers) 1121 */1122
1123 size = skb->len - ((unsignedchar *) th - skb->data);
1124
1125 /*1126 * Sanity check it.. 1127 */1128
1129 if (size < sizeof(structtcphdr) || size > skb->len)
1130 {1131 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1132 skb, skb->data, th, skb->len);
1133 kfree_skb(skb, FREE_WRITE);
1134 return;
1135 }1136
1137 /*1138 * If we have queued a header size packet.. (these crash a few1139 * tcp stacks if ack is not set)1140 */1141
1142 if (size == sizeof(structtcphdr))
1143 {1144 /* If it's got a syn or fin it's notionally included in the size..*/1145 if(!th->syn && !th->fin)
1146 {1147 printk("tcp_send_skb: attempt to queue a bogon.\n");
1148 kfree_skb(skb,FREE_WRITE);
1149 return;
1150 }1151 }1152
1153 /*1154 * Actual processing.1155 */1156
1157 tcp_statistics.TcpOutSegs++;
1158 skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1159
1160 /*1161 * We must queue if1162 *1163 * a) The right edge of this frame exceeds the window1164 * b) We are retransmitting (Nagle's rule)1165 * c) We have too many packets 'in flight'1166 */1167
1168 if (after(skb->h.seq, sk->window_seq) ||
1169 (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1170 sk->packets_out >= sk->cong_window)
1171 {1172 /* checksum will be supplied by tcp_write_xmit. So1173 * we shouldn't need to set it at all. I'm being paranoid */1174 th->check = 0;
1175 if (skb->next != NULL)
1176 {1177 printk("tcp_send_partial: next != NULL\n");
1178 skb_unlink(skb);
1179 }1180 skb_queue_tail(&sk->write_queue, skb);
1181
1182 /*1183 * If we don't fit we have to start the zero window1184 * probes. This is broken - we really need to do a partial1185 * send _first_ (This is what causes the Cisco and PC/TCP1186 * grief).1187 */1188
1189 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1190 sk->send_head == NULL && sk->ack_backlog == 0)
1191 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1192 }1193 else1194 {1195 /*1196 * This is going straight out1197 */1198
1199 th->ack_seq = ntohl(sk->acked_seq);
1200 th->window = ntohs(tcp_select_window(sk));
1201
1202 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1203
1204 sk->sent_seq = sk->write_seq;
1205
1206 /*1207 * This is mad. The tcp retransmit queue is put together1208 * by the ip layer. This causes half the problems with1209 * unroutable FIN's and other things.1210 */1211
1212 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1213
1214 /*1215 * Set for next retransmit based on expected ACK time.1216 * FIXME: We set this every time which means our 1217 * retransmits are really about a window behind.1218 */1219
1220 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1221 }1222 }1223
1224 /*1225 * Locking problems lead us to a messy situation where we can have1226 * multiple partially complete buffers queued up. This is really bad1227 * as we don't want to be sending partial buffers. Fix this with1228 * a semaphore or similar to lock tcp_write per socket.1229 *1230 * These routines are pretty self descriptive.1231 */1232
1233 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1234 {1235 structsk_buff * skb;
1236 unsignedlongflags;
1237
1238 save_flags(flags);
1239 cli();
1240 skb = sk->partial;
1241 if (skb) {1242 sk->partial = NULL;
1243 del_timer(&sk->partial_timer);
1244 }1245 restore_flags(flags);
1246 returnskb;
1247 }1248
1249 /*1250 * Empty the partial queue1251 */1252
1253 staticvoidtcp_send_partial(structsock *sk)
/* */1254 {1255 structsk_buff *skb;
1256
1257 if (sk == NULL)
1258 return;
1259 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1260 tcp_send_skb(sk, skb);
1261 }1262
1263 /*1264 * Queue a partial frame1265 */1266
1267 voidtcp_enqueue_partial(structsk_buff * skb, structsock * sk)
/* */1268 {1269 structsk_buff * tmp;
1270 unsignedlongflags;
1271
1272 save_flags(flags);
1273 cli();
1274 tmp = sk->partial;
1275 if (tmp)
1276 del_timer(&sk->partial_timer);
1277 sk->partial = skb;
1278 init_timer(&sk->partial_timer);
1279 /*1280 * Wait up to 1 second for the buffer to fill.1281 */1282 sk->partial_timer.expires = HZ;
1283 sk->partial_timer.function = (void (*)(unsignedlong)) tcp_send_partial;
1284 sk->partial_timer.data = (unsignedlong) sk;
1285 add_timer(&sk->partial_timer);
1286 restore_flags(flags);
1287 if (tmp)
1288 tcp_send_skb(sk, tmp);
1289 }1290
1291
1292 /*1293 * This routine sends an ack and also updates the window. 1294 */1295
/*
 *	This routine sends an ack and also updates the window.
 *
 *	sequence - sequence number to place in the ACK frame
 *	ack      - acknowledgement number to send
 *	sk       - sending socket
 *	th       - header of the segment being acknowledged (used as a
 *	           template for the reply's ports)
 *	daddr    - destination IP address
 */
static void tcp_send_ack(unsigned long sequence, unsigned long ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;		/* We have been reset, we may not send again */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */
	buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 *	Force it to send an ack.  We don't have to do this
		 *	(ACK is unreliable) but it's much better use of
		 *	bandwidth on slow links to send a spare ack than
		 *	resend packets.
		 */
		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 *	Assemble a suitable TCP frame
	 */
	buff->len = sizeof(struct tcphdr);
	buff->sk = sk;
	buff->localroute = sk->localroute;
	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route - give the memory back and bail out. */
		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}
	buff->len += tmp;
	/* TCP header starts right after the IP/link headers just built. */
	t1 =(struct tcphdr *)((char *)t1 +tmp);

	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */
	t1->dest = th->source;
	t1->source = th->dest;
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 *	If we have nothing queued for transmit and the transmit timer
	 *	is on we are just doing an ACK timeout and need to switch
	 *	to a keepalive.
	 */
	if (ack == sk->acked_seq)
	{
		/* This ACK covers everything received - clear the backlog. */
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
			&& sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 *	Fill in the packet and send it
	 */
	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		 printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}
1410
1411 /* 1412 * This routine builds a generic TCP header. 1413 */1414
1415 extern__inlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */1416 {1417
1418 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1419 th->seq = htonl(sk->write_seq);
1420 th->psh =(push == 0) ? 1 : 0;
1421 th->doff = sizeof(*th)/4;
1422 th->ack = 1;
1423 th->fin = 0;
1424 sk->ack_backlog = 0;
1425 sk->bytes_rcv = 0;
1426 sk->ack_timed = 0;
1427 th->ack_seq = htonl(sk->acked_seq);
1428 sk->window = tcp_select_window(sk);
1429 th->window = htons(sk->window);
1430
1431 return(sizeof(*th));
1432 }1433
1434 /*1435 * This routine copies from a user buffer into a socket,1436 * and starts the transmit system.1437 */1438
/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	sk       - the sending socket (caller has looked it up)
 *	from     - user-space buffer to copy from
 *	len      - number of bytes to send
 *	nonblock - non-zero for non-blocking operation
 *	flags    - MSG_OOB / MSG_DONTROUTE
 *
 *	Returns the number of bytes queued/sent, or a negative errno.
 *	Once anything has been copied, errors are suppressed and the
 *	partial count is returned instead.
 */
static int tcp_write(struct sock *sk, unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;		/* bytes accepted so far */
	int copy;		/* bytes to place in the current frame */
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;	/* oversized skb that may become a partial */
	unsigned char *buff;
	struct tcphdr *th;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1;
	prot = sk->prot;
	while(len > 0)
	{
		if (sk->err)
		{			/* Stop on an error */
			release_sock(sk);
			if (copied)
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 *	First thing we do is make sure that we are established.
		 */
		if (sk->shutdown & SEND_SHUTDOWN)
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied)
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/*
		 *	Wait for a connection to finish.
		 */
		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
		{
			if (sk->err)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			/* Not connecting either - the connection is dead. */
			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
			{
				release_sock(sk);
				if (copied)
					return(copied);

				if (sk->err)
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen)
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			release_sock(sk);
			cli();

			/* Re-test under cli() before sleeping to avoid a
			 * lost-wakeup race with the network bottom half. */
			if (sk->state != TCP_ESTABLISHED &&
			    sk->state != TCP_CLOSE_WAIT && sk->err == 0)
			{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

		/*
		 *	The following code can result in copy <= if sk->mss is ever
		 *	decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
		 *	sk->mtu is constant once SYN processing is finished.  I.e. we
		 *	had better not get here until we've seen his SYN and at least one
		 *	valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
		 *	But ESTABLISHED should guarantee that.  sk->max_window is by definition
		 *	non-decreasing.  Note that any ioctl to set user_mss must be done
		 *	before the exchange of SYN's.  If the initial ack from the other
		 *	end has a window of 0, max_window and thus mss will both be 0.
		 */

		/*
		 *	Now we need to check if we have a half built packet.
		 */
		if ((skb = tcp_dequeue_partial(sk)) != NULL)
		{
			int hdrlen;

			/* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				 + sizeof(struct tcphdr);

			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB))
			{
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0)
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}

				memcpy_fromfs(skb->data + skb->len, from, copy);
				skb->len += copy;
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy;
			}
			/* Full frame, OOB, or nothing in flight: send it now;
			 * otherwise re-queue it as a partial (Nagle). */
			if ((skb->len - hdrlen) >= sk->mss ||
				(flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);
			continue;
		}

		/*
		 *	We also need to worry about the window.
		 *	If window < 1/2 the maximum window we've seen from this
		 *	host, don't use it.  This is sender side
		 *	silly window prevention, as specified in RFC1122.
		 *	(Note that this is different than earlier versions of
		 *	SWS prevention, e.g. RFC813.).  What we actually do is
		 *	use the whole MSS.  Since the results in the right
		 *	edge of the packet being outside the window, it will
		 *	be queued for later rather than sent.
		 */
		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		if (copy > len)
			copy = len;

		/*
		 *	We should really check the window here also.
		 */
		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB))
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb;
		}
		else
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep.
		 */
		if (skb == NULL)
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition.
			 */
			tmp = sk->wmem_alloc;
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it.
			 */
			if (tmp <= sk->wmem_alloc &&
				  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0)
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->len = 0;
		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

		buff = skb->data;

		/*
		 *	FIXME: we need to optimize this.
		 *	Perhaps some hints here would be good.
		 */
		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 )
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}
		skb->len += tmp;
		skb->dev = dev;
		buff += tmp;
		skb->h.th =(struct tcphdr *) buff;
		tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
		if (tmp < 0)
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB)
		{
			((struct tcphdr *)buff)->urg = 1;
			((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
		}
		skb->len += tmp;
		memcpy_fromfs(buff+tmp, from, copy);

		from += copy;
		copied += copy;
		len -= copy;
		skb->len += copy;
		skb->free = 0;
		sk->write_seq += copy;

		/* A deliberately oversized frame with data still in flight
		 * becomes the new partial, to be topped up later. */
		if (send_tmp != NULL && sk->packets_out)
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

	/*
	 *	Nagle's rule.  Turn Nagle off with TCP_NODELAY for highly
	 *	interactive fast network servers.  It's meant to be on and
	 *	it really improves the throughput though not the echo time
	 *	on my slow slip link - Alan
	 */

	/*
	 *	Avoid possible race on send_tmp - c/o Johannes Stille
	 */
	if(sk->partial && ((!sk->packets_out)
	/* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
1764 /*1765 * This is just a wrapper. 1766 */1767
1768 staticinttcp_sendto(structsock *sk, unsignedchar *from,
/* */1769 intlen, intnonblock, unsignedflags,
1770 structsockaddr_in *addr, intaddr_len)
1771 {1772 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1773 return -EINVAL;
1774 if (sk->state == TCP_CLOSE)
1775 return -ENOTCONN;
1776 if (addr_len < sizeof(*addr))
1777 return -EINVAL;
1778 if (addr->sin_family && addr->sin_family != AF_INET)
1779 return -EINVAL;
1780 if (addr->sin_port != sk->dummy_th.dest)
1781 return -EISCONN;
1782 if (addr->sin_addr.s_addr != sk->daddr)
1783 return -EISCONN;
1784 returntcp_write(sk, from, len, nonblock, flags);
1785 }1786
1787
1788 /*1789 * Send an ack if one is backlogged at this point. Ought to merge1790 * this with tcp_send_ack().1791 */1792
/*
 *	Send an ack if one is backlogged at this point.  Ought to merge
 *	this with tcp_send_ack().  Builds a bare ACK frame from the
 *	socket's template header and transmits it immediately.
 */
static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	/* Nothing owed - nothing to do. */
	if (!sk->ack_backlog)
		return;

	/*
	 *	FIXME: we need to put code here to prevent this routine from
	 *	being called.  Being called once in a while is ok, so only check
	 *	if this is the second time in a row.
	 */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */
	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->len = sizeof(struct tcphdr);
	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route: give the buffer back. */
		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}

	buff->len += tmp;
	t1 =(struct tcphdr *)(buff->data +tmp);

	/* Template header, then fill in the live fields. */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	/* The ACK we are sending settles the backlog. */
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
1862
1863 /*1864 * FIXME:1865 * This routine frees used buffers.1866 * It should consider sending an ACK to let the1867 * other end know we now have a bigger window.1868 */1869
/*
 *	FIXME:
 *	This routine frees used buffers.
 *	It should consider sending an ACK to let the
 *	other end know we now have a bigger window.
 *
 *	Frees every fully-consumed skb at the head of the receive queue,
 *	then decides whether the freed space warrants an immediate ACK
 *	(window update) or a delayed one.
 */
static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	/* Receive space before we free anything, for comparison below. */
	left = sk->prot->rspace(sk);

	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can.
	 */
	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		/* Stop at the first skb still unread or still referenced. */
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */
	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
					    left);
	if ((rspace=sk->prot->rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
		/*
		 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
		 * if the other end is offering a window smaller than the agreed on MSS
		 * (called sk->mtu here).  In theory there's no connection between send
		 * and receive, and so no reason to think that they're going to send
		 * small packets.  For the moment I'm using the hack of reducing the mss
		 * only on the send side, so I'm putting mtu here.
		 */
		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);
		}
	}
}
1952
1953 /*1954 * Handle reading urgent data. BSD has very simple semantics for1955 * this, no blocking and very strange errors 8)1956 */1957
1958 staticinttcp_read_urg(structsock * sk, intnonblock,
/* */1959 unsignedchar *to, intlen, unsignedflags)
1960 {1961 /*1962 * No URG data to read1963 */1964 if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
1965 return -EINVAL; /* Yes this is right ! */1966
1967 if (sk->err)
1968 {1969 inttmp = -sk->err;
1970 sk->err = 0;
1971 returntmp;
1972 }1973
1974 if (sk->state == TCP_CLOSE || sk->done)
1975 {1976 if (!sk->done) {1977 sk->done = 1;
1978 return 0;
1979 }1980 return -ENOTCONN;
1981 }1982
1983 if (sk->shutdown & RCV_SHUTDOWN)
1984 {1985 sk->done = 1;
1986 return 0;
1987 }1988 sk->inuse = 1;
1989 if (sk->urg_data & URG_VALID)
1990 {1991 charc = sk->urg_data;
1992 if (!(flags & MSG_PEEK))
1993 sk->urg_data = URG_READ;
1994 put_fs_byte(c, to);
1995 release_sock(sk);
1996 return 1;
1997 }1998 release_sock(sk);
1999
2000 /*2001 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and2002 * the available implementations agree in this case:2003 * this call should never block, independent of the2004 * blocking state of the socket.2005 * Mike <pall@rz.uni-karlsruhe.de>2006 */2007 return -EAGAIN;
2008 }2009
2010
2011 /*2012 * This routine copies from a sock struct into the user buffer. 2013 */2014
/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	sk       - socket to read from
 *	to       - user-space destination buffer
 *	len      - maximum bytes to copy
 *	nonblock - non-zero for non-blocking behaviour
 *	flags    - MSG_OOB (diverted to tcp_read_urg) / MSG_PEEK
 *
 *	Returns bytes copied, 0 at end of stream, or a negative errno.
 */
static int tcp_read(struct sock *sk, unsigned char *to,
	int len, int nonblock, unsigned flags)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	unsigned long peek_seq;
	volatile unsigned long *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */
	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */
	if (flags & MSG_OOB)
		return tcp_read_urg(sk, nonblock, to, len, flags);

	/*
	 *	Copying sequence to update.  This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 *	A PEEK advances a private copy so the real pointer is
	 *	untouched.
	 */
	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		unsigned long offset;

		/*
		 *	Are we at urgent data?  Stop if we have read anything.
		 */
		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.  Set the task state before
		 *	scanning so a wakeup between scan and schedule()
		 *	is not lost.
		 */
		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* A gap before the next data - nothing contiguous. */
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies one sequence number */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* fully consumed, freeable */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* Nothing more available now; return what we have. */
		if (copied)
			break;

		if (sk->err)
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;	/* first EOF reads as 0 */
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Ack consumed data, drop the lock and wait for more. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer.  We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */
		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */
		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?  Stop short of it, and
		 *	skip over the urgent byte itself unless OOB-inline.
		 */
		if (sk->urg_data)
		{
			unsigned long urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					if (!sk->urginline)
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */
		*seq += used;

		/*
		 *	This memcpy_tofs can sleep.  If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */
		memcpy_tofs(to,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		to += used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb.  Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */
		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;	/* urgent byte is behind us now */
		if (used + offset < skb->len)
			continue;		/* more data left in this skb */

		/*
		 *	Process the FIN.
		 */
		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		continue;

	found_fin_ok:
		++*seq;				/* FIN consumes a sequence number */
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */
		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2242 /*2243 * State processing on a close. This implements the state shift for2244 * sending our FIN frame. Note that we only send a FIN for some 2245 * states. A shutdown() may have already sent the FIN, or we may be2246 * closed.2247 */2248
2249 staticinttcp_close_state(structsock *sk, intdead)
/* */2250 {2251 intns=TCP_CLOSE;
2252 intsend_fin=0;
2253 switch(sk->state)
2254 {2255 caseTCP_SYN_SENT: /* No SYN back, no FIN needed */2256 break;
2257 caseTCP_SYN_RECV:
2258 caseTCP_ESTABLISHED: /* Closedown begin */2259 ns=TCP_FIN_WAIT1;
2260 send_fin=1;
2261 break;
2262 caseTCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */2263 caseTCP_FIN_WAIT2:
2264 caseTCP_CLOSING:
2265 ns=sk->state;
2266 break;
2267 caseTCP_CLOSE:
2268 caseTCP_LISTEN:
2269 break;
2270 caseTCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and2271 wait only for the ACK */2272 ns=TCP_LAST_ACK;
2273 send_fin=1;
2274 }2275
2276 tcp_set_state(sk,ns);
2277
2278 /*2279 * This is a (useful) BSD violating of the RFC. There is a2280 * problem with TCP as specified in that the other end could2281 * keep a socket open forever with no application left this end.2282 * We use a 3 minute timeout (about the same as BSD) then kill2283 * our end. If they send after that then tough - BUT: long enough2284 * that we won't make the old 4*rto = almost no time - whoops2285 * reset mistake.2286 */2287 if(dead && ns==TCP_FIN_WAIT2)
2288 {2289 inttimer_active=del_timer(&sk->timer);
2290 if(timer_active)
2291 add_timer(&sk->timer);
2292 else2293 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2294 }2295
2296 returnsend_fin;
2297 }2298
2299 /*2300 * Send a fin.2301 */2302
/*
 *	Send a fin.
 *
 *	Builds a FIN frame from the socket's template header and either
 *	transmits it at once or, if data is still queued, appends it to
 *	the write queue so it goes out in order.
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */
	buff->sk = sk;
	buff->len = sizeof(*t1);
	buff->localroute = sk->localroute;
	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost.
		 *	(Not good).
		 */
		buff->free = 1;
		prot->wfree(sk,buff->mem_addr, buff->mem_len);
		/* Burn the sequence number as if the FIN had gone out. */
		sk->write_seq++;
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}

	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */
	t1 =(struct tcphdr *)((char *)t1 +tmp);
	buff->len += tmp;
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	t1->seq = ntohl(sk->write_seq);
	sk->write_seq++;	/* the FIN occupies one sequence number */
	buff->h.seq = sk->write_seq;
	t1->ack = 1;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 * If there is data in the write queue, the fin must be appended to
	 * the write queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		buff->free = 0;
		if (buff->next != NULL)
		{
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	}
	else
	{
		/* Nothing queued: send the FIN now and time it. */
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2402 /*2403 * Shutdown the sending side of a connection. Much like close except2404 * that we don't receive shut down or set sk->dead=1.2405 */2406
2407 voidtcp_shutdown(structsock *sk, inthow)
/* */2408 {2409 /*2410 * We need to grab some memory, and put together a FIN,2411 * and then put it into the queue to be sent.2412 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2413 */2414
2415 if (!(how & SEND_SHUTDOWN))
2416 return;
2417
2418 /*2419 * If we've already sent a FIN, or it's a closed state2420 */2421
2422 if (sk->state == TCP_FIN_WAIT1 ||
2423 sk->state == TCP_FIN_WAIT2 ||
2424 sk->state == TCP_CLOSING ||
2425 sk->state == TCP_LAST_ACK ||
2426 sk->state == TCP_TIME_WAIT ||
2427 sk->state == TCP_CLOSE ||
2428 sk->state == TCP_LISTEN2429 )
2430 {2431 return;
2432 }2433 sk->inuse = 1;
2434
2435 /*2436 * flag that the sender has shutdown2437 */2438
2439 sk->shutdown |= SEND_SHUTDOWN;
2440
2441 /*2442 * Clear out any half completed packets. 2443 */2444
2445 if (sk->partial)
2446 tcp_send_partial(sk);
2447
2448 /*2449 * FIN if needed2450 */2451
2452 if(tcp_close_state(sk,0))
2453 tcp_send_fin(sk);
2454
2455 release_sock(sk);
2456 }2457
2458
2459 staticint2460 tcp_recvfrom(structsock *sk, unsignedchar *to,
/* */2461 intto_len, intnonblock, unsignedflags,
2462 structsockaddr_in *addr, int *addr_len)
2463 {2464 intresult;
2465
2466 /* 2467 * Have to check these first unlike the old code. If 2468 * we check them after we lose data on an error2469 * which is wrong 2470 */2471
2472 if(addr_len)
2473 *addr_len = sizeof(*addr);
2474 result=tcp_read(sk, to, to_len, nonblock, flags);
2475
2476 if (result < 0)
2477 return(result);
2478
2479 if(addr)
2480 {2481 addr->sin_family = AF_INET;
2482 addr->sin_port = sk->dummy_th.dest;
2483 addr->sin_addr.s_addr = sk->daddr;
2484 }2485 return(result);
2486 }2487
2488
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Builds a bare RST segment in answer to an unacceptable incoming
 *	segment (header *th, arrived saddr->daddr) and queues it for
 *	transmission. No socket is associated with the reply.
 */

static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	int tmp;
	struct device *ndev=NULL;

	/*
	 *	Cannot reset a reset (think about it).
	 */

	if(th->rst)
		return;

	/*
	 *	We need to grab some memory, and put together an RST,
	 *	and then put it into the queue to be sent.
	 *	GFP_ATOMIC: we may be called from interrupt context, so we
	 *	must not sleep; on failure the RST is simply not sent.
	 */

	buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
		return;

	buff->len = sizeof(*t1);
	buff->sk = NULL;		/* no owning socket */
	buff->dev = dev;
	buff->localroute = 0;

	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr),tos,ttl);
	if (tmp < 0)
	{
		/* Could not build the headers (e.g. no route): drop the buffer. */
		buff->free = 1;
		prot->wfree(NULL, buff->mem_addr, buff->mem_len);
		return;
	}

	t1 =(struct tcphdr *)((char *)t1 +tmp);
	buff->len += tmp;
	/* Start from a copy of the offending header, then overwrite fields. */
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->rst = 1;
	t1->window = 0;

	if(th->ack)
	{
		/*
		 *	Incoming segment carried an ACK: per RFC 793 the RST
		 *	takes its sequence number from that ACK and carries no
		 *	ACK of its own.
		 */
		t1->ack = 0;
		t1->seq = th->ack_seq;
		t1->ack_seq = 0;
	}
	else
	{
		/*
		 *	No ACK on the incoming segment: send SEQ=0 and ACK the
		 *	received sequence (plus one if it was a SYN, which
		 *	occupies sequence space).
		 */
		t1->ack = 1;
		if(!th->syn)
			t1->ack_seq=htonl(th->seq);
		else
			t1->ack_seq=htonl(th->seq+1);
		t1->seq=0;
	}

	/* Clear every other flag - this is a pure RST (+ACK) segment. */
	t1->syn = 0;
	t1->urg = 0;
	t1->fin = 0;
	t1->psh = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
	prot->queue_xmit(NULL, ndev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2576
2577 /*2578 * Look for tcp options. Parses everything but only knows about MSS.2579 * This routine is always called with the packet containing the SYN.2580 * However it may also be called with the ack to the SYN. So you2581 * can't assume this is always the SYN. It's always called after2582 * we have set up sk->mtu to our own MTU.2583 *2584 * We need at minimum to add PAWS support here. Possibly large windows2585 * as Linux gets deployed on 100Mb/sec networks.2586 */2587
2588 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2589 {2590 unsignedchar *ptr;
2591 intlength=(th->doff*4)-sizeof(structtcphdr);
2592 intmss_seen = 0;
2593
2594 ptr = (unsignedchar *)(th + 1);
2595
2596 while(length>0)
2597 {2598 intopcode=*ptr++;
2599 intopsize=*ptr++;
2600 switch(opcode)
2601 {2602 caseTCPOPT_EOL:
2603 return;
2604 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */2605 length--;
2606 ptr--; /* the opsize=*ptr++ above was a mistake */2607 continue;
2608
2609 default:
2610 if(opsize<=2) /* Avoid silly options looping forever */2611 return;
2612 switch(opcode)
2613 {2614 caseTCPOPT_MSS:
2615 if(opsize==4 && th->syn)
2616 {2617 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2618 mss_seen = 1;
2619 }2620 break;
2621 /* Add other options here as people feel the urge to implement stuff like large windows */2622 }2623 ptr+=opsize-2;
2624 length-=opsize;
2625 }2626 }2627 if (th->syn)
2628 {2629 if (! mss_seen)
2630 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2631 }2632 #ifdefCONFIG_INET_PCTCP2633 sk->mss = min(sk->max_window >> 1, sk->mtu);
2634 #else2635 sk->mss = min(sk->max_window, sk->mtu);
2636 #endif2637 }2638
/*
 *	Return the classful (A/B/C) network mask for an address given in
 *	network byte order; the mask is returned in network byte order too.
 */

static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);
	unsigned long mask;

	if (IN_CLASSA(host))
		mask = IN_CLASSA_NET;
	else if (IN_CLASSB(host))
		mask = IN_CLASSB_NET;
	else
		mask = IN_CLASSC_NET;

	return htonl(mask);
}
/*
 *	Default sequence number picking algorithm.
 *	As close as possible to RFC 793, which
 *	suggests using a 250kHz clock.
 *	Further reading shows this assumes 2MB/s networks.
 *	For 10MB/s ethernet, a 1MHz clock is appropriate.
 *	That's funny, Linux has one built in! Use it!
 */

extern inline unsigned long tcp_init_seq(void)
{
	struct timeval now;

	/* 1MHz "clock": seconds scaled to microseconds plus the usec part. */
	do_gettimeofday(&now);
	return now.tv_sec * 1000000 + now.tv_usec;
}
/*
 *	This routine handles a connection request.
 *	It should make sure we haven't already responded.
 *	Because of the way BSD works, we have to send a syn/ack now.
 *	This also means it will be harder to close a socket which is
 *	listening.
 *
 *	sk is the listening socket; skb carries the SYN (saddr/daddr are the
 *	addresses as seen on the wire, seq is our chosen initial sequence
 *	number). Clones sk into a new SYN_RECV socket, answers with SYN|ACK
 *	and queues skb on the listener so accept() can find the new socket.
 */

static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 unsigned long daddr, unsigned long saddr,
		 struct options *opt, struct device *dev, unsigned long seq)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	unsigned char *ptr;
	struct sock *newsk;
	struct tcphdr *th;
	struct device *ndev=NULL;
	int tmp;
	struct rtable *rt;

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead)
	{
		/* Wake the listener so accept() can run. */
		sk->data_ready(sk,0);
	}
	else
	{
		if(sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	Make sure we can accept more. This will prevent a
	 *	flurry of syns from eating up all our memory.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	We need to build a new sock struct.
	 *	It is sort of bad to have a socket without an inode attached
	 *	to it, but the wake_up's will just wake up the listening socket,
	 *	and if the listening socket is destroyed before this is taken
	 *	off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* just ignore the syn. It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/* Start from a byte copy of the listener, then reset all per-connection
	   state; the queue heads and timers below must NOT stay shared. */
	memcpy(newsk, sk, sizeof(*newsk));
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;	/* slow start: one segment */
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	/* The SYN occupies one unit of sequence space. */
	newsk->acked_seq = skb->h.th->seq+1;
	newsk->copied_seq = skb->h.th->seq+1;
	newsk->fin_seq = skb->h.th->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger=0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function=&retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/*
	 *	Swap these two, they are from our point of view.
	 */

	newsk->daddr = saddr;
	newsk->saddr = daddr;

	put_sock(newsk->num,newsk);
	newsk->dummy_th.res1 = 0;
	newsk->dummy_th.doff = 6;	/* 24-byte header: room for the MSS option */
	newsk->dummy_th.fin = 0;
	newsk->dummy_th.syn = 0;
	newsk->dummy_th.rst = 0;
	newsk->dummy_th.psh = 0;
	newsk->dummy_th.ack = 0;
	newsk->dummy_th.urg = 0;
	newsk->dummy_th.res2 = 0;
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->socket = NULL;	/* no inode until accept() attaches one */

	/*
	 *	Grab the ttl and tos values and use them
	 */

	newsk->ip_ttl=sk->ip_ttl;
	newsk->ip_tos=skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for
	 */

	/*
	 *	Note use of sk->user_mss, since user has no direct access to newsk
	 */

	rt=ip_rt_route(saddr, NULL,NULL);

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
		newsk->mtu = rt->rt_mss - HEADER_SIZE;
	else
	{
		/* No explicit MSS: conservative 576 off-net, large on-net.
		   NOTE(review): MAX_WINDOW as an MSS looks odd - presumably
		   clamped by the device-MTU min() just below; confirm. */
#ifdef CONFIG_INET_SNARL	/* Sub Nets Are Local */
		if ((saddr ^ daddr) & default_mask(saddr))
#else
		if ((saddr ^ daddr) & dev->pa_mask)
#endif
			newsk->mtu = 576 - HEADER_SIZE;
		else
			newsk->mtu = MAX_WINDOW;
	}

	/*
	 *	But not bigger than device MTU
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	This will min with what arrived in the packet
	 */

	tcp_options(newsk,skb->h.th);

	buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* NOTE(review): error is reported on the *listening* socket;
		   the half-built newsk is torn down via release_sock. */
		sk->err = -ENOMEM;
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		/* And this will destroy it */
		release_sock(newsk);
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->len = sizeof(struct tcphdr)+4;	/* header plus MSS option */
	buff->sk = newsk;
	buff->localroute = newsk->localroute;

	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
			       IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

	/*
	 *	Something went wrong.
	 */

	if (tmp < 0)
	{
		sk->err = tmp;
		buff->free = 1;
		kfree_skb(buff,FREE_WRITE);
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		release_sock(newsk);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->len += tmp;
	t1 =(struct tcphdr *)((char *)t1 +tmp);

	/* Build the SYN|ACK: copy the incoming header then fix every field. */
	memcpy(t1, skb->h.th, sizeof(*t1));
	buff->h.seq = newsk->write_seq;
	/*
	 *	Swap the send and the receive.
	 */
	t1->dest = skb->h.th->source;
	t1->source = newsk->dummy_th.source;
	t1->seq = ntohl(newsk->write_seq++);
	t1->ack = 1;
	newsk->window = tcp_select_window(newsk);
	newsk->sent_seq = newsk->write_seq;
	t1->window = ntohs(newsk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->ack_seq = ntohl(skb->h.th->seq+1);
	t1->doff = sizeof(*t1)/4+1;	/* +1 word for the MSS option */
	/* MSS option: kind 2, length 4, value = our mtu (big endian). */
	ptr =(unsigned char *)(t1+1);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
	ptr[3] =(newsk->mtu) & 0xff;

	tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
	reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
	skb->sk = newsk;

	/*
	 *	Charge the sock_buff to newsk.
	 */

	sk->rmem_alloc -= skb->mem_len;
	newsk->rmem_alloc += skb->mem_len;

	/* Queue the SYN on the listener so accept() can pick up newsk. */
	skb_queue_tail(&sk->receive_queue,skb);
	sk->ack_backlog++;
	release_sock(newsk);
	tcp_statistics.TcpOutSegs++;
}
2945
/*
 *	Close a TCP socket. timeout!=0 means hard close (drop straight to
 *	CLOSE); timeout==0 is the normal descriptor close, which flushes
 *	pending receive data and sends a FIN if the state machine asks for
 *	one (via tcp_close_state).
 */

static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 *	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 */

	sk->inuse = 1;

	if(sk->state == TCP_LISTEN)
	{
		/* Special case: a listener has no peer, just kill any
		   embryonic connections still waiting in the backlog. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;	/* no more sending or receiving */

	if (!sk->dead)
		sk->state_change(sk);

	if (timeout == 0)
	{
		struct sk_buff *skb;

		/*
		 *	We need to flush the recv. buffs. We do this only on the
		 *	descriptor close, not protocol-sourced closes, because the
		 *	reader process may not have drained the data yet!
		 */

		while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 *	Get rid off any half-completed packets.
		 */

		if (sk->partial)
			tcp_send_partial(sk);
	}


	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */

	if(timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* tcp_close_state returns 1 when this transition requires a
		   FIN to be sent to the peer. */
		if(tcp_close_state(sk,1)==1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
3010
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue. This happens as incoming acks
 *	open up the remote window for us.
 */

static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->h.seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it. This
			 *	cannot currently occur.
			 */

			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
			/*
			 *	put in the ack seq and window at this point rather than earlier,
			 *	in order to keep them monotonic. We really want to avoid taking
			 *	back window allocations. That's legal, but RFC1122 says it's frowned on.
			 *	Ack and window will in general have changed since this packet was put
			 *	on the write queue.
			 */
			iph = (struct iphdr *)(skb->data +
					skb->dev->hard_header_len);
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			/* TCP payload length = total minus everything before the TCP header. */
			size = skb->len - (((unsigned char *) th) - skb->data);

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			/* Header fields changed, so the checksum must be redone. */
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */

			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */

			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3102
/*
 *	This routine deals with incoming acks, but not outgoing ones.
 *
 *	Handles everything an arriving ACK implies: window updates (including
 *	illegal window shrinks), congestion window growth, RTT estimation,
 *	freeing acked data from the retransmit queue, timer management, and
 *	the ACK-driven state transitions (LAST_ACK, FIN_WAIT1, CLOSING,
 *	SYN_RECV). Returns 0 if the ack is unacceptable (ahead of what we
 *	sent), 1 otherwise.
 */

extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
	unsigned long ack;
	int flag = 0;

	/*
	 * flag bits:
	 * 1 - there was data in packet as well as ack or new data is sent or
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if(sk->zapped)
		return(1);	/* Dead, cant ack any more so why bother */

	/*
	 *	Have we discovered a larger window
	 */

	ack = ntohl(th->ack_seq);

	if (ntohs(th->window) > sk->max_window)
	{
		sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(sk->max_window>>1, sk->mtu);
#else
		sk->mss = min(sk->max_window, sk->mtu);
#endif
	}

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */

	if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
	{
		if(sk->debug)
			printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);

		/*
		 *	Ack for data we haven't sent yet: reject outright.
		 */

		if (after(ack, sk->sent_seq))
		{
			return(0);
		}

		/*
		 *	Old duplicate ack: just restart the keepalive timer.
		 */

		if (sk->keepopen)
		{
			if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
		}
		return(1);
	}

	/*
	 *	If there is data set flag 1
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 *	See if our window has been shrunk.
	 */

	if (after(sk->window_seq, ack+ntohs(th->window)))
	{
		/*
		 *	We may need to move packets from the send queue
		 *	to the write queue, if the window has been shrunk on us.
		 *	The RFC says you are not allowed to shrink your window
		 *	like this, but if the other end does, you must be able
		 *	to deal with it.
		 */
		struct sk_buff *skb;
		struct sk_buff *skb2;
		struct sk_buff *wskb = NULL;

		skb2 = sk->send_head;
		sk->send_head = NULL;
		sk->send_tail = NULL;

		/*
		 *	This is an artifact of a flawed concept. We want one
		 *	queue and a smarter send routine when we send all.
		 */

		flag |= 4;	/* Window changed */

		sk->window_seq = ack + ntohs(th->window);
		cli();	/* walk/rebuild both queues atomically w.r.t. interrupts */
		while (skb2 != NULL)
		{
			skb = skb2;
			skb2 = skb->link3;
			skb->link3 = NULL;
			if (after(skb->h.seq, sk->window_seq))
			{
				/* Now outside the window: pull it back. */
				if (sk->packets_out > 0)
					sk->packets_out--;
				/* We may need to remove this from the dev send list. */
				if (skb->next != NULL)
				{
					skb_unlink(skb);
				}
				/* Now add it to the write_queue, preserving order. */
				if (wskb == NULL)
					skb_queue_head(&sk->write_queue,skb);
				else
					skb_append(wskb,skb);
				wskb = skb;
			}
			else
			{
				/* Still inside the window: rebuild send list. */
				if (sk->send_head == NULL)
				{
					sk->send_head = skb;
					sk->send_tail = skb;
				}
				else
				{
					sk->send_tail->link3 = skb;
					sk->send_tail = skb;
				}
				skb->link3 = NULL;
			}
		}
		sti();
	}

	/*
	 *	Pipe has emptied
	 */

	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out= 0;
	}

	/*
	 *	Update the right hand window edge of the host
	 */

	sk->window_seq = ack + ntohs(th->window);

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
		 * counter and increment it once every cwnd times. It's possible
		 * that this should be done only if sk->retransmits == 0. I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly. In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

	/*
	 *	Remember the highest ack received.
	 */

	sk->rcv_ack_seq = ack;

	/*
	 *	If this ack opens up a zero window, clear backoff. It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open ?
		 */

		if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
		    ! before (sk->window_seq, sk->write_queue.next->h.seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt. this eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < 20)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = 20;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	while(sk->send_head != NULL)
	{
		/* Check for a bug. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived the other end.
		 */

		if (before(sk->send_head->h.seq, ack+1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 *	We were retransmitting. don't count this in RTT est
				 */
				flag |= 2;

				/*
				 *	even though we've gotten an ack, we're still
				 *	retransmitting as long as we're sending from
				 *	the retransmit queue. Keeping retransmits non-zero
				 *	prevents us from getting new data interspersed with
				 *	retransmissions.
				 */

				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 *	Note that we only reset backoff and rto in the
			 *	rtt recomputation code. And that doesn't happen
			 *	if there were retransmissions in effect. So the
			 *	first new packet after the retransmissions is
			 *	sent with the backoff still in effect. Not until
			 *	we get an ack from a non-retransmitted packet do
			 *	we reset the backoff and rto. This allows us to deal
			 *	with a situation where the network delay has increased
			 *	suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 *	We have one less packet out there.
			 */

			if (sk->packets_out > 0)
				sk->packets_out --;
			/*
			 *	Wake up the process, it can probably write more.
			 */
			if (!sk->dead)
				sk->write_space(sk);
			oskb = sk->send_head;

			if (!(flag&2))	/* Not retransmitting */
			{
				long m;

				/*
				 *	The following amusing code comes from Jacobson's
				 *	article in SIGCOMM '88. Note that rtt and mdev
				 *	are scaled versions of rtt and mean deviation.
				 *	This is designed to be as fast as possible
				 *	m stands for "measurement".
				 */

				m = jiffies - oskb->when;  /* RTT */
				if(m<=0)
					m=1;		/* IS THIS RIGHT FOR <0 ??? */
				m -= (sk->rtt >> 3);    /* m is now error in rtt est */
				sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
				if (m < 0)
					m = -m;		/* m is now abs(error) */
				m -= (sk->mdev >> 2);   /* similar update on mdev */
				sk->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */

				/*
				 *	Now update timeout. Note that this removes any backoff.
				 */

				sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
				if (sk->rto > 120*HZ)
					sk->rto = 120*HZ;
				if (sk->rto < 20)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
					sk->rto = 20;
				sk->backoff = 0;
			}
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
					   In this case as we just set it up */
			cli();
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 *	We may need to remove this from the dev send list.
			 */

			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 *	XXX someone ought to look at this too.. at the moment, if skb_peek()
	 *	returns non-NULL, we complete ignore the timer stuff in the else
	 *	clause. We ought to organize the code so that else clause can
	 *	(should) be executed regardless, possibly moving the PROBE timer
	 *	reset over. The skb_peek() thing should only move stuff to the
	 *	write queue, NOT also manage the timer functions.
	 */

	/*
	 *	Maybe we can take some stuff off of the write queue,
	 *	and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room.
			 */
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 *	from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 *	from TCP_CLOSE we don't do anything
		 *
		 *	from anything else, if there is write data (or fin) pending,
		 *	we use a TIME_WRITE timeout, else if keepalive we reset to
		 *	a KEEPALIVE timeout, else we delete the timer.
		 *
		 *	We do not set flag for nominal write data, otherwise we may
		 *	force a state where we start to write itsy bitsy tidbits
		 *	of data.
		 */

		switch(sk->state) {
		case TCP_TIME_WAIT:
			/*
			 *	keep us in TIME_WAIT until we stop getting packets,
			 *	reset the timeout.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 *	don't touch the timer.
			 */
			break;
		default:
			/*
			 *	Must check send_head, write_queue, and ack_backlog
			 *	to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 *	We have nothing queued but space to send. Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 *	In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
	 *	we are now waiting for an acknowledge to our FIN. The other end is
	 *	already in TIME_WAIT.
	 *
	 *	Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if(sk->debug)
			printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			tcp_set_state(sk,TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 *	Move to FIN_WAIT2 to await a FIN from the other end. Set
	 *	SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 *	Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake
	 */

	if(sk->state==TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest=th->source;
		sk->copied_seq = sk->acked_seq;
		if(!sk->dead)
			sk->state_change(sk);
		if(sk->max_window==0)
		{
			sk->max_window=32;	/* Sanity check */
			sk->mss=min(sk->max_window,sk->mtu);
		}
	}

	/*
	 *	I make no guarantees about the first clause in the following
	 *	test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
	 *	what conditions "!flag" would be true. However I think the rest
	 *	of the conditions would prevent that from causing any
	 *	unnecessary retransmission.
	 *	Clearly if the first packet has expired it should be
	 *	retransmitted. The other alternative, "flag&2 && retransmits", is
	 *	harder to explain: You have to look carefully at how and when the
	 *	timer is set and with what timeout. The most recent transmission always
	 *	sets the timer. So in general if the most recent thing has timed
	 *	out, everything before it has as well. So we want to go ahead and
	 *	retransmit some more. If we didn't explicitly test for this
	 *	condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 *	would not be true. If you look at the pattern of timing, you can
	 *	show that rto is increased fast enough that the next packet would
	 *	almost never be retransmitted immediately. Then you'd end up
	 *	waiting for a timeout to send each packet on the retransmission
	 *	queue. With my implementation of the Karn sampling algorithm,
	 *	the timeout would double each time. The net result is that it would
	 *	take a hideous amount of time to recover from a single dropped packet.
	 *	It's possible that there should also be a test for TIME_WRITE, but
	 *	I think as long as "send_head != NULL" and "retransmit" is on, we've
	 *	got to be in real retransmission mode.
	 *	Note that tcp_do_retransmit is called with all==1. Setting cong_window
	 *	back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 *	As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	       (((flag&2) && sk->retransmits) ||
	       (sk->send_head->when + sk->rto < jiffies)))
	{
		if(sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return(1);
}
3693
/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 */

static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	/*
	 *	Record the sequence number just past the FIN: data length plus
	 *	one each for the SYN and FIN flags, which occupy sequence space.
	 */
	sk->fin_seq = th->seq + skb->len + th->syn + th->fin;

	if (!sk->dead)
	{
		/* Wake anyone sleeping on or selecting this socket. */
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 *	move to CLOSE_WAIT, tcp_data() already handled
			 *	sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			/* FIN+RST together: shut the socket down completely. */
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 *	received a retransmission of the FIN, do
			 *	nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 *	received a retransmission of the FIN,
			 *	restart the TIME_WAIT timer.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 *	This case occurs when a simultaneous close
			 *	happens, we must ack the received FIN and
			 *	enter the CLOSING state.
			 *
			 *	This causes a WRITE timeout, which will either
			 *	move on to TIME_WAIT when we timeout, or resend
			 *	the FIN properly (maybe we get rid of that annoying
			 *	FIN lost hang). The TIME_WRITE code is already correct
			 *	for handling this timeout.
			 */
			if(sk->ip_xmit_timeout != TIME_WRITE)
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 *	received a FIN -- send ACK and enter TIME_WAIT
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 *	already in CLOSE
			 */
			break;
		default:
			/* Any remaining state (e.g. after our own close):
			   go to LAST-ACK and wait for the final ack. */
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}
3789
3790
3791 /*3792 * This routine handles the data. If there is room in the buffer,3793 * it will be have already been moved into it. If there is no3794 * room, then we will just have to discard the packet.3795 */3796
3797 extern__inline__inttcp_data(structsk_buff *skb, structsock *sk,
/* */3798 unsignedlongsaddr, unsignedshortlen)
3799 {3800 structsk_buff *skb1, *skb2;
3801 structtcphdr *th;
3802 intdup_dumped=0;
3803 unsignedlongnew_seq;
3804 unsignedlongshut_seq;
3805
3806 th = skb->h.th;
3807 skb->len = len -(th->doff*4);
3808
3809 /*3810 * The bytes in the receive read/assembly queue has increased. Needed for the3811 * low memory discard algorithm 3812 */3813
3814 sk->bytes_rcv += skb->len;
3815
3816 if (skb->len == 0 && !th->fin)
3817 {3818 /* 3819 * Don't want to keep passing ack's back and forth. 3820 * (someone sent us dataless, boring frame)3821 */3822 if (!th->ack)
3823 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3824 kfree_skb(skb, FREE_READ);
3825 return(0);
3826 }3827
3828 /*3829 * We no longer have anyone receiving data on this connection.3830 */3831
3832 #ifndef TCP_DONT_RST_SHUTDOWN
3833
3834 if(sk->shutdown & RCV_SHUTDOWN)
3835 {3836 /*3837 * FIXME: BSD has some magic to avoid sending resets to3838 * broken 4.2 BSD keepalives. Much to my surprise a few non3839 * BSD stacks still have broken keepalives so we want to3840 * cope with it.3841 */3842
3843 if(skb->len) /* We don't care if it's just an ack or3844 a keepalive/window probe */3845 {3846 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */3847
3848 /* Do this the way 4.4BSD treats it. Not what I'd3849 regard as the meaning of the spec but it's what BSD3850 does and clearly they know everything 8) */3851
3852 /*3853 * This is valid because of two things3854 *3855 * a) The way tcp_data behaves at the bottom.3856 * b) A fin takes effect when read not when received.3857 */3858
3859 shut_seq=sk->acked_seq+1; /* Last byte */3860
3861 if(after(new_seq,shut_seq))
3862 {3863 if(sk->debug)
3864 printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n",
3865 sk, new_seq, shut_seq, sk->blog);
3866 if(sk->dead)
3867 {3868 sk->acked_seq = new_seq + th->fin;
3869 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3870 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3871 tcp_statistics.TcpEstabResets++;
3872 tcp_set_state(sk,TCP_CLOSE);
3873 sk->err = EPIPE;
3874 sk->shutdown = SHUTDOWN_MASK;
3875 kfree_skb(skb, FREE_READ);
3876 return 0;
3877 }3878 }3879 }3880 }3881
3882 #endif3883
3884 /*3885 * Now we have to walk the chain, and figure out where this one3886 * goes into it. This is set up so that the last packet we received3887 * will be the first one we look at, that way if everything comes3888 * in order, there will be no performance loss, and if they come3889 * out of order we will be able to fit things in nicely.3890 *3891 * [AC: This is wrong. We should assume in order first and then walk3892 * forwards from the first hole based upon real traffic patterns.]3893 * 3894 */3895
3896 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */3897 {3898 skb_queue_head(&sk->receive_queue,skb);
3899 skb1= NULL;
3900 }3901 else3902 {3903 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
3904 {3905 if(sk->debug)
3906 {3907 printk("skb1=%p :", skb1);
3908 printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
3909 printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
3910 printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
3911 sk->acked_seq);
3912 }3913
3914 /*3915 * Optimisation: Duplicate frame or extension of previous frame from3916 * same sequence point (lost ack case).3917 * The frame contains duplicate data or replaces a previous frame3918 * discard the previous frame (safe as sk->inuse is set) and put3919 * the new one in its place.3920 */3921
3922 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3923 {3924 skb_append(skb1,skb);
3925 skb_unlink(skb1);
3926 kfree_skb(skb1,FREE_READ);
3927 dup_dumped=1;
3928 skb1=NULL;
3929 break;
3930 }3931
3932 /*3933 * Found where it fits3934 */3935
3936 if (after(th->seq+1, skb1->h.th->seq))
3937 {3938 skb_append(skb1,skb);
3939 break;
3940 }3941
3942 /*3943 * See if we've hit the start. If so insert.3944 */3945 if (skb1 == skb_peek(&sk->receive_queue))
3946 {3947 skb_queue_head(&sk->receive_queue, skb);
3948 break;
3949 }3950 }3951 }3952
3953 /*3954 * Figure out what the ack value for this frame is3955 */3956
3957 th->ack_seq = th->seq + skb->len;
3958 if (th->syn)
3959 th->ack_seq++;
3960 if (th->fin)
3961 th->ack_seq++;
3962
3963 if (before(sk->acked_seq, sk->copied_seq))
3964 {3965 printk("*** tcp.c:tcp_data bug acked < copied\n");
3966 sk->acked_seq = sk->copied_seq;
3967 }3968
3969 /*3970 * Now figure out if we can ack anything. This is very messy because we really want two3971 * receive queues, a completed and an assembly queue. We also want only one transmit3972 * queue.3973 */3974
3975 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
3976 {3977 if (before(th->seq, sk->acked_seq+1))
3978 {3979 intnewwindow;
3980
3981 if (after(th->ack_seq, sk->acked_seq))
3982 {3983 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3984 if (newwindow < 0)
3985 newwindow = 0;
3986 sk->window = newwindow;
3987 sk->acked_seq = th->ack_seq;
3988 }3989 skb->acked = 1;
3990
3991 /*3992 * When we ack the fin, we do the FIN 3993 * processing.3994 */3995
3996 if (skb->h.th->fin)
3997 {3998 tcp_fin(skb,sk,skb->h.th);
3999 }4000
4001 for(skb2 = skb->next;
4002 skb2 != (structsk_buff *)&sk->receive_queue;
4003 skb2 = skb2->next)
4004 {4005 if (before(skb2->h.th->seq, sk->acked_seq+1))
4006 {4007 if (after(skb2->h.th->ack_seq, sk->acked_seq))
4008 {4009 newwindow = sk->window -
4010 (skb2->h.th->ack_seq - sk->acked_seq);
4011 if (newwindow < 0)
4012 newwindow = 0;
4013 sk->window = newwindow;
4014 sk->acked_seq = skb2->h.th->ack_seq;
4015 }4016 skb2->acked = 1;
4017 /*4018 * When we ack the fin, we do4019 * the fin handling.4020 */4021 if (skb2->h.th->fin)
4022 {4023 tcp_fin(skb,sk,skb->h.th);
4024 }4025
4026 /*4027 * Force an immediate ack.4028 */4029
4030 sk->ack_backlog = sk->max_ack_backlog;
4031 }4032 else4033 {4034 break;
4035 }4036 }4037
4038 /*4039 * This also takes care of updating the window.4040 * This if statement needs to be simplified.4041 */4042 if (!sk->delay_acks ||
4043 sk->ack_backlog >= sk->max_ack_backlog ||
4044 sk->bytes_rcv > sk->max_unacked || th->fin) {4045 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4046 }4047 else4048 {4049 sk->ack_backlog++;
4050 if(sk->debug)
4051 printk("Ack queued.\n");
4052 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4053 }4054 }4055 }4056
4057 /*4058 * If we've missed a packet, send an ack.4059 * Also start a timer to send another.4060 */4061
4062 if (!skb->acked)
4063 {4064
4065 /*4066 * This is important. If we don't have much room left,4067 * we need to throw out a few packets so we have a good4068 * window. Note that mtu is used, not mss, because mss is really4069 * for the send side. He could be sending us stuff as large as mtu.4070 */4071
4072 while (sk->prot->rspace(sk) < sk->mtu)
4073 {4074 skb1 = skb_peek(&sk->receive_queue);
4075 if (skb1 == NULL)
4076 {4077 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4078 break;
4079 }4080
4081 /*4082 * Don't throw out something that has been acked. 4083 */4084
4085 if (skb1->acked)
4086 {4087 break;
4088 }4089
4090 skb_unlink(skb1);
4091 kfree_skb(skb1, FREE_READ);
4092 }4093 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4094 sk->ack_backlog++;
4095 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4096 }4097 else4098 {4099 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4100 }4101
4102 /*4103 * Now tell the user we may have some data. 4104 */4105
4106 if (!sk->dead)
4107 {4108 if(sk->debug)
4109 printk("Data wakeup.\n");
4110 sk->data_ready(sk,0);
4111 }4112 return(0);
4113 }4114
4115
4116 /*4117 * This routine is only called when we have urgent data4118 * signalled. Its the 'slow' part of tcp_urg. It could be4119 * moved inline now as tcp_urg is only called from one4120 * place. We handle URGent data wrong. We have to - as4121 * BSD still doesn't use the correction from RFC961.4122 */4123
4124 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4125 {4126 unsignedlongptr = ntohs(th->urg_ptr);
4127
4128 if (ptr)
4129 ptr--;
4130 ptr += th->seq;
4131
4132 /* ignore urgent data that we've already seen and read */4133 if (after(sk->copied_seq, ptr))
4134 return;
4135
4136 /* do we already have a newer (or duplicate) urgent pointer? */4137 if (sk->urg_data && !after(ptr, sk->urg_seq))
4138 return;
4139
4140 /* tell the world about our new urgent pointer */4141 if (sk->proc != 0) {4142 if (sk->proc > 0) {4143 kill_proc(sk->proc, SIGURG, 1);
4144 }else{4145 kill_pg(-sk->proc, SIGURG, 1);
4146 }4147 }4148 sk->urg_data = URG_NOTYET;
4149 sk->urg_seq = ptr;
4150 }4151
4152 /*4153 * This is the 'fast' part of urgent handling.4154 */4155
4156 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4157 unsignedlongsaddr, unsignedlonglen)
4158 {4159 unsignedlongptr;
4160
4161 /*4162 * Check if we get a new urgent pointer - normally not 4163 */4164
4165 if (th->urg)
4166 tcp_check_urg(sk,th);
4167
4168 /*4169 * Do we wait for any urgent data? - normally not4170 */4171
4172 if (sk->urg_data != URG_NOTYET)
4173 return 0;
4174
4175 /*4176 * Is the urgent pointer pointing into this packet? 4177 */4178
4179 ptr = sk->urg_seq - th->seq + th->doff*4;
4180 if (ptr >= len)
4181 return 0;
4182
4183 /*4184 * Ok, got the correct packet, update info 4185 */4186
4187 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4188 if (!sk->dead)
4189 sk->data_ready(sk,0);
4190 return 0;
4191 }4192
/*
 *	This will accept the next outstanding connection.
 *
 *	Returns the newly connected socket, or NULL with sk->err set
 *	(EINVAL, EAGAIN or ERESTARTSYS).
 */

static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 *	We need to make sure that this socket is listening,
	 *	and that it has something pending.
	 */

	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race. */
	cli();
	sk->inuse = 1;

	/* Each established connection sits as an skb on the listen
	   socket's queue; block until one arrives. */
	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Drop the lock before sleeping so incoming segments
		   can be processed and complete a handshake. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked)
		{
			/* Interrupted by a signal: let the caller restart. */
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk.
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4251
/*
 *	This will initiate an outgoing connection.
 *
 *	Validates the address, picks an initial sequence number, builds
 *	and transmits the SYN (with an MSS option), moves the socket to
 *	SYN_SENT and arms the retransmit timer.
 *
 *	Returns 0 on success or a negative errno.
 */

static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
	{
		return(-EISCONN);
	}

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */

	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 *	Don't want a TCP connection going to a broadcast address
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	/* Initialise the connection identity and sequence space. */
	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	release_sock(sk);

	/* GFP_KERNEL: this allocation may sleep, hence the release above. */
	buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->len = 24;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;

	t1 = (struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 *	(The route is consulted below for window clamp and MSS hints.)
	 */

	rt=ip_rt_route(sk->daddr, NULL, NULL);

	/*
	 *	We need to build the routing stuff from the things saved in skb.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 +tmp);

	/* Build the SYN segment from the template header. */
	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	/* NOTE(review): ntohl here is semantically htonl (identical on
	   all supported machines) -- host-order seq to network order. */
	t1->seq = ntohl(sk->write_seq++);
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;	/* 24 bytes: header plus the 4-byte MSS option */
	/* use 512 or whatever user asked for */

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/* MSS selection: explicit user setting, then route hint, then a
	   netmask heuristic (576 off-net, MAX_WINDOW on-net). */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
		sk->mtu = rt->rt_mss;
	else
	{
#ifdef CONFIG_INET_SNARL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - HEADER_SIZE;
		else
			sk->mtu = MAX_WINDOW;
	}

	/*
	 *	but not bigger than device MTU
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	Put in the TCP options to say MTU.
	 *	(kind 2, length 4, 16-bit MSS in network byte order)
	 */

	ptr = (unsigned char *)(t1+1);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 *	This must go first otherwise a really quick response will get reset.
	 */

	tcp_set_state(sk,TCP_SYN_SENT);
	sk->rto = TCP_TIMEOUT_INIT;
#if 0 /* we already did this */
	init_timer(&sk->retransmit_timer);
#endif
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	/* Timer for repeating the SYN until an answer.
	   NOTE(review): the same timer is re-armed again just after
	   queue_xmit below; the duplication looks redundant -- confirm. */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	sk->retransmits = TCP_SYN_RETRIES;

	sk->prot->queue_xmit(sk, dev, buff, 0);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4416
4417 /* This functions checks to see if the tcp header is actually acceptable. */4418 extern__inline__inttcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4419 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4420 {4421 unsignedlongnext_seq;
4422
4423 next_seq = len - 4*th->doff;
4424 if (th->fin)
4425 next_seq++;
4426 /* if we have a zero window, we can't have any data in the packet.. */4427 if (next_seq && !sk->window)
4428 gotoignore_it;
4429 next_seq += th->seq;
4430
4431 /*4432 * This isn't quite right. sk->acked_seq could be more recent4433 * than sk->window. This is however close enough. We will accept4434 * slightly more packets than we should, but it should not cause4435 * problems unless someone is trying to forge packets.4436 */4437
4438 /* have we already seen all of this packet? */4439 if (!after(next_seq+1, sk->acked_seq))
4440 gotoignore_it;
4441 /* or does it start beyond the window? */4442 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4443 gotoignore_it;
4444
4445 /* ok, at least part of this packet would seem interesting.. */4446 return 1;
4447
4448 ignore_it:
4449 if (th->rst)
4450 return 0;
4451
4452 /*4453 * Send a reset if we get something not ours and we are4454 * unsynchronized. Note: We don't do anything to our end. We4455 * are just killing the bogus remote connection then we will4456 * connect again and it will work (with luck).4457 */4458
4459 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4460 {4461 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4462 return 1;
4463 }4464
4465 /* Try to resync things. */4466 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4467 return 0;
4468 }4469
/*
 *	When we get a reset we do this: record an appropriate error,
 *	close the socket down, wake the owner and drop the frame.
 *	Always returns 0.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Pick the errno the current state calls for. */
	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	/* Let anyone sleeping on the socket see the state change. */
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
/*
 *	A TCP packet has arrived.
 *
 *	Main receive entry point: checksums the segment, finds the owning
 *	socket (queueing to its backlog if busy), then walks the RFC793
 *	incoming-segment steps -- sequence check, RST, SYN, ACK, URG, data.
 *	Always returns 0; the skb is consumed on every path.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	unsigned long daddr, unsigned short len,
	unsigned long saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	if (!skb)
	{
		printk("IMPOSSIBLE 1\n");
		return(0);
	}

	if (!dev)
	{
		printk("IMPOSSIBLE 2\n");
		return(0);
	}

	tcp_statistics.TcpInSegs++;

	/* Segments not addressed to this host are dropped. */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket.
	 */

	sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo)
	{
		/* First pass (not a backlog replay): verify the checksum. */
		if (tcp_check(th, len, saddr, daddr ))
		{
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}
		th->seq = ntohl(th->seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset
			 *	(checked in tcp_reset)
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			/*
			 *	Discard frame
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->len = len;
		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here. */
		cli();
		if (sk->inuse)
		{
			/* Socket is busy: park the segment for replay later. */
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Backlog replay: socket may have vanished meanwhile. */
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 */

	if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf)
	{
		/* Receive buffer full: drop the segment. */
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

	skb->sk=sk;
	sk->rmem_alloc += skb->mem_len;

	/*
	 *	This basically follows the flow suggested by RFC793, with the
	 *	corrections in RFC1122. We don't implement precedence and we
	 *	process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn
	 *	notes in the KA9Q code the RFC793 incoming segment rules don't
	 *	initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed
			 *	(old segments). Broadcast/multicast SYN isn't
			 *	allowed. Note - bug if you change the netmask on
			 *	a running connection it can go broadcast. Even
			 *	Sun's have this problem so I'm ignoring it
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is
			 *	nothing else in the frame. KA9Q has an option to
			 *	send data with the syn, BSD accepts data with the
			 *	syn up to the [to be] advertised window and
			 *	Solaris 2.1 gives you a protocol error. For now
			 *	we just ignore it, that fits the spec precisely
			 *	and avoids incompatibilities. It would be nice in
			 *	future to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and
		 *	either reset for bad matches or go to connected
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/*
					 *	Reset the ack - its an ack from a
					 *	different connection [ th->rst is
					 *	checked in tcp_reset() ]
					 */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/*
					 *	A valid ack from a different
					 *	connection start. Shouldn't
					 *	happen but cover it
					 */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}

				/*
				 *	Ok.. it's good. Set up sequence numbers
				 *	and move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq=th->seq+1;
				sk->fin_seq=th->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if(sk->max_window==0)
				{
					sk->max_window = 32;	/* Sanity check */
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/*
					 *	Crossed SYN's are fine - but
					 *	talking to yourself is right out...
					 */
					if(sk->saddr==saddr && sk->daddr==daddr &&
					   sk->dummy_th.source==th->source &&
					   sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port.
	 *	There is a more complex suggestion for fixing these reuse issues
	 *	in RFC1644 but not yet ready for general use. Also see RFC1379.
	 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(th->seq, sk->acked_seq) && !th->rst)
		{
			/* A new SYN for an old TIME_WAIT pair: kill the old
			   socket and hand the SYN to the matching listener. */
			long seq=sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			sk->rmem_alloc -= skb->mem_len;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->mem_len;
				/* New ISN derived from the old connection's
				   sequence space to stay unambiguous. */
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */

	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}

	/*
	 *	Process the ACK
	 */

	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	Process urgent data
	 */

	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	Process the encapsulated data
	 */

	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */

	release_sock(sk);
	return 0;
}
4897 /*4898 * This routine sends a packet with an out of date sequence4899 * number. It assumes the other end will try to ack it.4900 */4901
4902 staticvoidtcp_write_wakeup(structsock *sk)
/* */4903 {4904 structsk_buff *buff;
4905 structtcphdr *t1;
4906 structdevice *dev=NULL;
4907 inttmp;
4908
4909 if (sk->zapped)
4910 return; /* After a valid reset we can send no more */4911
4912 /*4913 * Write data can still be transmitted/retransmitted in the4914 * following states. If any other state is encountered, return.4915 * [listen/close will never occur here anyway]4916 */4917
4918 if (sk->state != TCP_ESTABLISHED &&
4919 sk->state != TCP_CLOSE_WAIT &&
4920 sk->state != TCP_FIN_WAIT1 &&
4921 sk->state != TCP_LAST_ACK &&
4922 sk->state != TCP_CLOSING4923 )
4924 {4925 return;
4926 }4927
4928 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
4929 if (buff == NULL)
4930 return;
4931
4932 buff->len = sizeof(structtcphdr);
4933 buff->free = 1;
4934 buff->sk = sk;
4935 buff->localroute = sk->localroute;
4936
4937 t1 = (structtcphdr *) buff->data;
4938
4939 /* Put in the IP header and routing stuff. */4940 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4941 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
4942 if (tmp < 0)
4943 {4944 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4945 return;
4946 }4947
4948 buff->len += tmp;
4949 t1 = (structtcphdr *)((char *)t1 +tmp);
4950
4951 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
4952
4953 /*4954 * Use a previous sequence.4955 * This should cause the other end to send an ack.4956 */4957
4958 t1->seq = htonl(sk->sent_seq-1);
4959 t1->ack = 1;
4960 t1->res1= 0;
4961 t1->res2= 0;
4962 t1->rst = 0;
4963 t1->urg = 0;
4964 t1->psh = 0;
4965 t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */4966 t1->syn = 0;
4967 t1->ack_seq = ntohl(sk->acked_seq);
4968 t1->window = ntohs(tcp_select_window(sk));
4969 t1->doff = sizeof(*t1)/4;
4970 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
4971 /*4972 * Send it and free it.4973 * This will prevent the timer from automatically being restarted.4974 */4975 sk->prot->queue_xmit(sk, dev, buff, 1);
4976 tcp_statistics.TcpOutSegs++;
4977 }4978
4979 /*4980 * A window probe timeout has occurred.4981 */4982
4983 voidtcp_send_probe0(structsock *sk)
/* */4984 {4985 if (sk->zapped)
4986 return; /* After a valid reset we can send no more */4987
4988 tcp_write_wakeup(sk);
4989
4990 sk->backoff++;
4991 sk->rto = min(sk->rto << 1, 120*HZ);
4992 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
4993 sk->retransmits++;
4994 sk->prot->retransmits ++;
4995 }4996
4997 /*4998 * Socket option code for TCP. 4999 */5000
5001 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5002 {5003 intval,err;
5004
5005 if(level!=SOL_TCP)
5006 returnip_setsockopt(sk,level,optname,optval,optlen);
5007
5008 if (optval == NULL)
5009 return(-EINVAL);
5010
5011 err=verify_area(VERIFY_READ, optval, sizeof(int));
5012 if(err)
5013 returnerr;
5014
5015 val = get_fs_long((unsignedlong *)optval);
5016
5017 switch(optname)
5018 {5019 caseTCP_MAXSEG:
5020 /*5021 * values greater than interface MTU won't take effect. however at5022 * the point when this call is done we typically don't yet know5023 * which interface is going to be used5024 */5025 if(val<1||val>MAX_WINDOW)
5026 return -EINVAL;
5027 sk->user_mss=val;
5028 return 0;
5029 caseTCP_NODELAY:
5030 sk->nonagle=(val==0)?0:1;
5031 return 0;
5032 default:
5033 return(-ENOPROTOOPT);
5034 }5035 }5036
5037 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5038 {5039 intval,err;
5040
5041 if(level!=SOL_TCP)
5042 returnip_getsockopt(sk,level,optname,optval,optlen);
5043
5044 switch(optname)
5045 {5046 caseTCP_MAXSEG:
5047 val=sk->user_mss;
5048 break;
5049 caseTCP_NODELAY:
5050 val=sk->nonagle;
5051 break;
5052 default:
5053 return(-ENOPROTOOPT);
5054 }5055 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5056 if(err)
5057 returnerr;
5058 put_fs_long(sizeof(int),(unsignedlong *) optlen);
5059
5060 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5061 if(err)
5062 returnerr;
5063 put_fs_long(val,(unsignedlong *)optval);
5064
5065 return(0);
5066 }5067
5068
/*
 * The protocol operations table binding TCP into the INET socket layer.
 * Entries are positional and must follow the member order of struct
 * proto (declared elsewhere, e.g. sock.h) -- NOTE(review): the comments
 * below are inferred from the function names; confirm against the
 * struct proto declaration before relying on them.
 */
struct proto tcp_prot = {
	sock_wmalloc,		/* generic write-side buffer alloc */
	sock_rmalloc,		/* generic read-side buffer alloc */
	sock_wfree,
	sock_rfree,
	sock_rspace,		/* receive-space accounting */
	sock_wspace,		/* send-space accounting */
	tcp_close,
	tcp_read,
	tcp_write,
	tcp_sendto,
	tcp_recvfrom,
	ip_build_header,	/* header building delegated to IP */
	tcp_connect,
	tcp_accept,
	ip_queue_xmit,		/* transmit delegated to IP */
	tcp_retransmit,
	tcp_write_wakeup,
	tcp_read_wakeup,
	tcp_rcv,		/* inbound demultiplex entry point */
	tcp_select,
	tcp_ioctl,
	NULL,			/* no hook for this slot -- presumably init; confirm */
	tcp_shutdown,
	tcp_setsockopt,
	tcp_getsockopt,
	128,			/* numeric slot -- presumably max header space; confirm */
	0,
	{NULL,},
	"TCP",			/* protocol name */
	0, 0
};