1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@no.unit.nvg> 20 * 21 * Fixes: 22 * Alan Cox : Numerous verify_area() calls 23 * Alan Cox : Set the ACK bit on a reset 24 * Alan Cox : Stopped it crashing if it closed while sk->inuse=1 25 * and was trying to connect (tcp_err()). 26 * Alan Cox : All icmp error handling was broken 27 * pointers passed where wrong and the 28 * socket was looked up backwards. Nobody 29 * tested any icmp error code obviously. 30 * Alan Cox : tcp_err() now handled properly. It wakes people 31 * on errors. select behaves and the icmp error race 32 * has gone by moving it into sock.c 33 * Alan Cox : tcp_reset() fixed to work for everything not just 34 * packets for unknown sockets. 35 * Alan Cox : tcp option processing. 36 * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong] 37 * Herp Rosmanith : More reset fixes 38 * Alan Cox : No longer acks invalid rst frames. Acking 39 * any kind of RST is right out. 40 * Alan Cox : Sets an ignore me flag on an rst receive 41 * otherwise odd bits of prattle escape still 42 * Alan Cox : Fixed another acking RST frame bug. Should stop 43 * LAN workplace lockups. 
44 * Alan Cox : Some tidyups using the new skb list facilities 45 * Alan Cox : sk->keepopen now seems to work 46 * Alan Cox : Pulls options out correctly on accepts 47 * Alan Cox : Fixed assorted sk->rqueue->next errors 48 * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops. 49 * Alan Cox : Tidied tcp_data to avoid a potential nasty. 50 * Alan Cox : Added some better commenting, as the tcp is hard to follow 51 * Alan Cox : Removed incorrect check for 20 * psh 52 * Michael O'Reilly : ack < copied bug fix. 53 * Johannes Stille : Misc tcp fixes (not all in yet). 54 * Alan Cox : FIN with no memory -> CRASH 55 * Alan Cox : Added socket option proto entries. Also added awareness of them to accept. 56 * Alan Cox : Added TCP options (SOL_TCP) 57 * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets. 58 * Alan Cox : Use ip_tos/ip_ttl settings. 59 * Alan Cox : Handle FIN (more) properly (we hope). 60 * Alan Cox : RST frames sent on unsynchronised state ack error/ 61 * Alan Cox : Put in missing check for SYN bit. 62 * Alan Cox : Added tcp_select_window() aka NET2E 63 * window non shrink trick. 64 * Alan Cox : Added a couple of small NET2E timer fixes 65 * Charles Hedrick : TCP fixes 66 * Toomas Tamm : TCP window fixes 67 * Alan Cox : Small URG fix to rlogin ^C ack fight 68 * Charles Hedrick : Rewrote most of it to actually work 69 * Linus : Rewrote tcp_read() and URG handling 70 * completely 71 * Gerhard Koerting: Fixed some missing timer handling 72 * Matthew Dillon : Reworked TCP machine states as per RFC 73 * Gerhard Koerting: PC/TCP workarounds 74 * Adam Caldwell : Assorted timer/timing errors 75 * Matthew Dillon : Fixed another RST bug 76 * Alan Cox : Move to kernel side addressing changes. 77 * Alan Cox : Beginning work on TCP fastpathing (not yet usable) 78 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
79 * Alan Cox : TCP fast path debugging 80 * Alan Cox : Window clamping 81 * Michael Riepe : Bug in tcp_check() 82 * Matt Dillon : More TCP improvements and RST bug fixes 83 * Matt Dillon : Yet more small nasties remove from the TCP code 84 * (Be very nice to this man if tcp finally works 100%) 8) 85 * Alan Cox : BSD accept semantics. 86 * Alan Cox : Reset on closedown bug. 87 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 88 * Michael Pall : Handle select() after URG properly in all cases. 89 * Michael Pall : Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin). 90 * Michael Pall : Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now. 91 * Michael Pall : recv(...,MSG_OOB) never blocks in the BSD api. 92 * Alan Cox : Changed the semantics of sk->socket to 93 * fix a race and a signal problem with 94 * accept() and async I/O. 95 * Alan Cox : Relaxed the rules on tcp_sendto(). 96 * Yury Shevchuk : Really fixed accept() blocking problem. 97 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 98 * clients/servers which listen in on 99 * fixed ports. 100 * Alan Cox : Cleaned the above up and shrank it to 101 * a sensible code size. 102 * Alan Cox : Self connect lockup fix. 103 * Alan Cox : No connect to multicast. 104 * Ross Biro : Close unaccepted children on master 105 * socket close. 106 * Alan Cox : Reset tracing code. 107 * Alan Cox : Spurious resets on shutdown. 108 * Alan Cox : Giant 15 minute/60 second timer error 109 * Alan Cox : Small whoops in selecting before an accept. 110 * Alan Cox : Kept the state trace facility since it's 111 * handy for debugging. 112 * Alan Cox : More reset handler fixes. 113 * Alan Cox : Started rewriting the code based on the RFC's 114 * for other useful protocol references see: 115 * Comer, KA9Q NOS, and for a reference on the 116 * difference between specifications and how BSD 117 * works see the 4.4lite source. 
118 * A.N.Kuznetsov : Don't time wait on completion of tidy 119 * close. 120 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 121 * Linus Torvalds : Fixed BSD port reuse to work first syn 122 * Alan Cox : Reimplemented timers as per the RFC and using multiple 123 * timers for sanity. 124 * Alan Cox : Small bug fixes, and a lot of new 125 * comments. 126 * Alan Cox : Fixed dual reader crash by locking 127 * the buffers (much like datagram.c) 128 * Alan Cox : Fixed stuck sockets in probe. A probe 129 * now gets fed up of retrying without 130 * (even a no space) answer. 131 * Alan Cox : Extracted closing code better 132 * Alan Cox : Fixed the closing state machine to 133 * resemble the RFC. 134 * Alan Cox : More 'per spec' fixes. 135 * Alan Cox : tcp_data() doesn't ack illegal PSH 136 * only frames. At least one pc tcp stack 137 * generates them. 138 * 139 * 140 * To Fix: 141 * Fast path the code. Two things here - fix the window calculation 142 * so it doesn't iterate over the queue, also spot packets with no funny 143 * options arriving in order and process directly. 144 * 145 * Implement RFC 1191 [Path MTU discovery] 146 * Look at the effect of implementing RFC 1337 suggestions and their impact. 147 * Rewrite output state machine to use a single queue and do low window 148 * situations as per the spec (RFC 1122) 149 * Speed up input assembly algorithm. 150 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 151 * could do with it working on IPv4 152 * User settable/learned rtt/max window/mtu 153 * Cope with MTU/device switches when retransmitting in tcp. 154 * Fix the window handling to use PR's new code. 155 * 156 * Change the fundamental structure to a single send queue maintained 157 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 158 * active routes too]). Cut the queue off in tcp_retransmit/ 159 * tcp_transmit. 160 * Change the receive queue to assemble as it goes. 
This lets us 161 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 162 * tcp_data/tcp_read as well as the window shrink crud. 163 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 164 * tcp_queue_skb seem obvious routines to extract. 165 * 166 * This program is free software; you can redistribute it and/or 167 * modify it under the terms of the GNU General Public License 168 * as published by the Free Software Foundation; either version 169 * 2 of the License, or(at your option) any later version. 170 * 171 * Description of States: 172 * 173 * TCP_SYN_SENT sent a connection request, waiting for ack 174 * 175 * TCP_SYN_RECV received a connection request, sent ack, 176 * waiting for final ack in three-way handshake. 177 * 178 * TCP_ESTABLISHED connection established 179 * 180 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 181 * transmission of remaining buffered data 182 * 183 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 184 * to shutdown 185 * 186 * TCP_CLOSING both sides have shutdown but we still have 187 * data we have to finish sending 188 * 189 * TCP_TIME_WAIT timeout to catch resent junk before entering 190 * closed, can only be entered from FIN_WAIT2 191 * or CLOSING. Required because the other end 192 * may not have gotten our last ACK causing it 193 * to retransmit the data packet (which we ignore) 194 * 195 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 196 * us to finish writing our data and to shutdown 197 * (we have to close() to move on to LAST_ACK) 198 * 199 * TCP_LAST_ACK out side has shutdown after remote has 200 * shutdown. There may still be data in our 201 * buffer that we have to finish sending 202 * 203 * TCP_CLOSE socket is finished 204 */ 205
206 #include <linux/types.h>
207 #include <linux/sched.h>
208 #include <linux/mm.h>
209 #include <linux/time.h>
210 #include <linux/string.h>
211 #include <linux/config.h>
212 #include <linux/socket.h>
213 #include <linux/sockios.h>
214 #include <linux/termios.h>
215 #include <linux/in.h>
216 #include <linux/fcntl.h>
217 #include <linux/inet.h>
218 #include <linux/netdevice.h>
219 #include "snmp.h"
220 #include "ip.h"
221 #include "protocol.h"
222 #include "icmp.h"
223 #include "tcp.h"
224 #include "arp.h"
225 #include <linux/skbuff.h>
226 #include "sock.h"
227 #include "route.h"
228 #include <linux/errno.h>
229 #include <linux/timer.h>
230 #include <asm/system.h>
231 #include <asm/segment.h>
232 #include <linux/mm.h>
233
/*
 * The MSL timer is the 'normal' timer.
 */

#define reset_msl_timer(x,y,z)	reset_timer(x,y,z)

/* presumably a tick used by the initial-sequence-number clock;
   its user is outside this view -- TODO confirm */
#define SEQ_TICK 3
unsigned long seq_offset;
struct tcp_mib	tcp_statistics;		/* SNMP counters for the TCP MIB */

/* Forward declaration: tcp_close_pending() below needs it. */
static void tcp_close(struct sock *sk, int timeout);


/*
 * The less said about this the better, but it works and will do for 1.2
 *
 * Wait queue used to wake processes select()ing on a listening socket
 * when an embryonic connection becomes ESTABLISHED; see tcp_set_state()
 * and tcp_listen_select().
 */

static struct wait_queue *master_select_wakeup;
/*
 * Return the smaller of two unsigned values.  (The int return type
 * is historical; callers only pass small positive quantities.)
 */
static __inline__ int min(unsigned int a, unsigned int b)
{
	return (a < b) ? a : b;
}
/*
 * Define STATE_TRACE to get printk() tracing of TCP state changes
 * in tcp_set_state().  Disabled by default.
 */
#undef STATE_TRACE

#ifdef STATE_TRACE
/* Human-readable names, indexed by the TCP_* state constants. */
static char *statename[]={
	"Unused","Established","Syn Sent","Syn Recv",
	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
	"Close Wait","Last ACK","Listen","Closing"
};
#endif
/*
 * Move a socket to a new TCP state, keeping the TcpCurrEstab SNMP
 * gauge in step (decrement on leaving ESTABLISHED, increment on
 * entering it).  Completing a passive open (SYN_RECV -> ESTABLISHED)
 * also wakes anyone select()ing on a listening socket.
 */
static __inline__ void tcp_set_state(struct sock *sk, int state)
{
	if(sk->state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab--;
#ifdef STATE_TRACE
	if(sk->debug)
		printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
#endif
	/* This is a hack but it doesn't occur often and it's going to
	   be a real pain to fix nicely */

	if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
	{
		wake_up_interruptible(&master_select_wakeup);
	}
	sk->state=state;
	if(state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab++;
}
290 /* 291 * This routine picks a TCP windows for a socket based on 292 * the following constraints 293 * 294 * 1. The window can never be shrunk once it is offered (RFC 793) 295 * 2. We limit memory per socket 296 * 297 * For now we use NET2E3's heuristic of offering half the memory 298 * we have handy. All is not as bad as this seems however because 299 * of two things. Firstly we will bin packets even within the window 300 * in order to get the data we are waiting for into the memory limit. 301 * Secondly we bin common duplicate forms at receive time 302 * Better heuristics welcome 303 */ 304
305 inttcp_select_window(structsock *sk)
/* */ 306 { 307 intnew_window = sk->prot->rspace(sk);
308
309 if(sk->window_clamp)
310 new_window=min(sk->window_clamp,new_window);
311 /* 312 * Two things are going on here. First, we don't ever offer a 313 * window less than min(sk->mss, MAX_WINDOW/2). This is the 314 * receiver side of SWS as specified in RFC1122. 315 * Second, we always give them at least the window they 316 * had before, in order to avoid retracting window. This 317 * is technically allowed, but RFC1122 advises against it and 318 * in practice it causes trouble. 319 * 320 * Fixme: This doesn't correctly handle the case where 321 * new_window > sk->window but not by enough to allow for the 322 * shift in sequence space. 323 */ 324 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
325 return(sk->window);
326 return(new_window);
327 } 328
329 /* 330 * Find someone to 'accept'. Must be called with 331 * sk->inuse=1 or cli() 332 */ 333
334 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 335 { 336 structsk_buff *p=skb_peek(&s->receive_queue);
337 if(p==NULL)
338 returnNULL;
339 do 340 { 341 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
342 returnp;
343 p=p->next;
344 } 345 while(p!=(structsk_buff *)&s->receive_queue);
346 returnNULL;
347 } 348
/*
 * Remove a completed connection and return it. This is used by
 * tcp_accept() to get connections from the queue.
 */

static struct sk_buff *tcp_dequeue_established(struct sock *s)
{
	struct sk_buff *skb;
	unsigned long flags;

	/* Interrupts off: the receive queue is also touched at interrupt
	   time, so the find + unlink pair must be atomic. */
	save_flags(flags);
	cli();
	skb=tcp_find_established(s);
	if(skb!=NULL)
		skb_unlink(skb);	/* Take it off the queue */
	restore_flags(flags);
	return skb;
}
/*
 * This routine closes sockets which have been at least partially
 * opened, but not yet accepted. Currently it is only called by
 * tcp_close, and timeout mirrors the value there.
 */

static void tcp_close_pending (struct sock *sk)
{
	struct sk_buff *skb;

	/* Each queued skb holds an embryonic connection: mark its sock
	   dead, close it, then free the buffer itself. */
	while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
	{
		skb->sk->dead=1;
		tcp_close(skb->sk, 0);
		kfree_skb(skb, FREE_READ);
	}
	return;
}
/*
 * Enter the time wait state: shut the socket down in both
 * directions, notify any sleeper, and arm the MSL timer that will
 * eventually move us to CLOSE.
 */

static void tcp_time_wait(struct sock *sk)
{
	tcp_set_state(sk,TCP_TIME_WAIT);
	sk->shutdown = SHUTDOWN_MASK;
	if (!sk->dead)
		sk->state_change(sk);	/* wake anyone waiting on the socket */
	reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}
/*
 * A socket has timed out on its send queue and wants to do a
 * little retransmitting. Currently this means TCP.
 *
 * Walks the unacked queue from sk->send_head, refreshing each
 * frame's IP id, ack and window fields before re-queueing it on the
 * device.  'all' == 0 resends only the head frame; otherwise up to
 * sk->cong_window frames are resent.
 */

void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;
	int ct=0;		/* frames processed this call */

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;	/* restamp for RTT/timer purposes */

		/*
		 * In general it's OK just to use the old packet.  However we
		 * need to use the current ack and window fields.  Urg and
		 * urg_ptr could possibly stand to be updated as well, but we
		 * don't keep the necessary data.  That shouldn't be a problem,
		 * if the other end is doing the right thing.  Since we're
		 * changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)(skb->data + dev->hard_header_len);
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = skb->len - (((unsigned char *) th) - skb->data);

		/*
		 * Note: We ought to check for window limits here but
		 * currently this is done (less efficiently) elsewhere.
		 * We do need to check for a route change but can't handle
		 * that until we have the new 1.3.x buffers in.
		 *
		 */

		iph->id = htons(ip_id_count++);
		ip_send_check(iph);

		/*
		 * This is not the right way to handle this. We have to
		 * issue an up to date window and ack report with this
		 * retransmit to keep the odd buggy tcp that relies on
		 * the fact BSD does this happy.
		 * We don't however need to recalculate the entire
		 * checksum, so someone wanting a small problem to play
		 * with might like to implement RFC1141/RFC1624 and speed
		 * this up by avoiding a full checksum.
		 */

		/* ntohl/ntohs double as htonl/htons here - the swap is
		   symmetric on every supported byte order. */
		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));
		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		/*
		 * If the interface is (still) up and running, kick it.
		 */

		if (dev->flags & IFF_UP)
		{
			/*
			 * If the packet is still being sent by the device/protocol
			 * below then don't retransmit. This is both needed, and good -
			 * especially with connected mode AX.25 where it stops resends
			 * occurring of an as yet unsent anyway frame!
			 * We still add up the counts as the round trip time wants
			 * adjusting.
			 */
			if (sk && !skb_device_locked(skb))
			{
				/* Remove it from any existing driver queue first! */
				skb_unlink(skb);
				/* Now queue it */
				ip_statistics.IpOutRequests++;
				dev_queue_xmit(skb, dev, sk->priority);
			}
		}

		/*
		 * Count retransmissions
		 */

		ct++;
		sk->prot->retransmits ++;

		/*
		 * Only one retransmit requested.
		 */

		if (!all)
			break;

		/*
		 * This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;	/* next frame on the send queue */
	}
}
/*
 * Reset the retransmission timer: remember why it is running
 * (TIME_WRITE, TIME_PROBE0, TIME_KEEPOPEN...) and re-arm it to fire
 * 'when' ticks from now.
 */

static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
{
	del_timer(&sk->retransmit_timer);
	sk->ip_xmit_timeout = why;
	if((int)when < 0)
	{
		/* Defensive: a "negative" delay would effectively never
		   fire; fall back to a short 3-tick delay and complain. */
		when=3;
		printk("Error: Negative timer in xmit_timer\n");
	}
	sk->retransmit_timer.expires=when;
	add_timer(&sk->retransmit_timer);
}
/*
 * This is the normal code called for timeouts.  It does the retransmission
 * and then does backoff.  tcp_do_retransmit is separated out because
 * tcp_ack needs to send stuff from the retransmit queue without
 * initiating a backoff.
 */

void tcp_retransmit_time(struct sock *sk, int all)
{
	tcp_do_retransmit(sk, all);

	/*
	 * Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely.  We will have to fix
	 * the 120 second clamps though!
	 */

	sk->retransmits++;
	sk->backoff++;
	sk->rto = min(sk->rto << 1, 120*HZ);	/* exponential backoff, clamped */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
}
563
/*
 * A timer event has trigger a tcp retransmit timeout. The
 * socket xmit queue is ready and set up to send. Because
 * the ack receive code keeps the queue straight we do
 * nothing clever here.
 *
 * With all == 0 this is a congestion event: remember half the
 * current window in ssthresh and drop back to slow start before
 * retransmitting.
 */

static void tcp_retransmit(struct sock *sk, int all)
{
	if (all)
	{
		/* Full-queue retransmit: no congestion adjustment. */
		tcp_retransmit_time(sk, all);
		return;
	}

	sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
	/* sk->ssthresh in theory can be zero.  I guess that's OK */
	sk->cong_count = 0;

	sk->cong_window = 1;

	/* Do the actual retransmit. */
	tcp_retransmit_time(sk, all);
}
/*
 * A write timeout has occurred. Process the after effects.
 *
 * Returns 0 when the socket has been moved to CLOSE (the caller must
 * not keep using it for transmission), 1 otherwise.
 */

static int tcp_write_timeout(struct sock *sk)
{
	/*
	 * Look for a 'soft' timeout.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
	{
		/*
		 * Attempt to recover if arp has changed (unlikely!) or
		 * a route has shifted (not supported prior to 1.3).
		 */
		arp_destroy (sk->daddr, 0);
		ip_route_check (sk->daddr);
	}
	/*
	 * Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2)
	{
		sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 * Time wait the socket
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 * Clean up time.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			return 0;
		}
	}
	return 1;
}
/*
 * The TCP retransmit timer. This lacks a few small details.
 *
 * 1. An initial rtt timeout on the probe0 should cause what we can
 *    of the first write queue buffer to be split and sent.
 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report
 *    ETIMEDOUT if we know an additional 'soft' error caused this.
 *    tcp_err should save a 'soft error' for us.
 *
 * Runs as a timer callback; 'data' is the struct sock pointer that
 * was stashed in the timer.  The reason for the timeout is read from
 * sk->ip_xmit_timeout.
 */

static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;

	/*
	 * only process if socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Socket busy or bottom half running: try again in 1 second */
		sk->retransmit_timer.expires = HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;	/* lock the socket against the bottom half */
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				/* Nothing unacked: nothing to retransmit. */
				restore_flags(flags);
			}
			else
			{
				/*
				 * Kicked by a delayed ack. Reset timer
				 * correctly now
				 */
				if (jiffies < skb->when + sk->rto)
				{
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 * Retransmission
				 */
				sk->prot->retransmit (sk, 0);
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 */

void tcp_err(int err, unsigned char *header, unsigned long daddr,
	unsigned long saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	struct iphdr *iph=(struct iphdr *)header;

	/* Skip the echoed IP header to reach the embedded TCP header. */
	header+=4*iph->ihl;


	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)
		return;		/* not one of our connections */

	if(err<0)
	{
		/* Hard local error: hand it straight to the user. */
		sk->err = -err;
		sk->error_report(sk);
		return;
	}

	if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8))
	{
		/*
		 * FIXME:
		 * For now we will just trigger a linear backoff.
		 * The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	/* sk->err = icmp_err_convert[err & 0xff].errno;  -- moved as TCP should hide non fatals internally (and does) */

	/*
	 * If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 */

	if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT)
	{
		if (sk->state == TCP_SYN_SENT)
		{
			/* Connection attempt killed by the ICMP error. */
			tcp_statistics.TcpAttemptFails++;
			tcp_set_state(sk,TCP_CLOSE);
			sk->error_report(sk);		/* Wake people up to see the error (see connect in sock.c) */
		}
		sk->err = icmp_err_convert[err & 0xff].errno;
	}
	return;
}
808
/*
 * Walk down the receive queue counting readable data until we hit the end or we find a gap
 * in the received data queue (ie a frame missing that needs sending to us).  Not
 * sorting using two queues as data arrives makes life so much harder.
 *
 * Returns the number of bytes a read() could currently consume,
 * starting from sk->copied_seq and stopping at the first hole or at
 * the first PSH boundary once some data has been counted.
 */

static int tcp_readable(struct sock *sk)
{
	unsigned long counted;		/* sequence number we have counted up to */
	unsigned long amount;		/* readable bytes found so far */
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	/* The receive queue is modified at interrupt time: lock it out. */
	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 * Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->h.th->seq))	/* Found a hole so stops here */
			break;
		sum = skb->len -(counted - skb->h.th->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;		/* SYN occupies a sequence number but carries no data */
		if (sum > 0)
		{	/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop.  This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available.  If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
886 /* 887 * LISTEN is a special case for select.. 888 */ 889 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */ 890 { 891 if (sel_type == SEL_IN) { 892 intretval;
893
894 sk->inuse = 1;
895 retval = (tcp_find_established(sk) != NULL);
896 release_sock(sk);
897 if (!retval)
898 select_wait(&master_select_wakeup,wait);
899 returnretval;
900 } 901 return 0;
902 } 903
904
/*
 * Wait for a TCP event.
 *
 * Note that we don't need to set "sk->inuse", as the upper select layers
 * take care of normal races (between the test and the event) and we don't
 * go look at any of the socket buffers directly.
 *
 * Returns 1 when the requested condition (readable / writable /
 * exceptional) already holds; otherwise registers on the socket's
 * wait queue and returns 0.
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type) {
	case SEL_IN:
		if (sk->err)
			return 1;	/* pending error is "readable" */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;		/* not connected yet */

		if (sk->shutdown & RCV_SHUTDOWN)
			return 1;	/* EOF is readable */

		if (sk->acked_seq == sk->copied_seq)
			break;		/* nothing new has arrived */

		/* Data is available.  Suppress readability only when the
		 * single unread byte is out-of-band data that will be
		 * delivered via MSG_OOB rather than normal read(). */
		if (sk->urg_seq != sk->copied_seq ||
		    sk->acked_seq != sk->copied_seq+1 ||
		    sk->urginline || !sk->urg_data)
			return 1;
		break;

	case SEL_OUT:
		if (sk->err)
			return 1;
		if (sk->shutdown & SEND_SHUTDOWN)
			return 0;	/* can never write again */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;
		/*
		 * This is now right thanks to a small fix
		 * by Matt Dillon.
		 */

		if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
			break;		/* not enough room for a useful write */
		return 1;

	case SEL_EX:
		if (sk->urg_data)
			return 1;	/* out-of-band data pending */
		break;
	}
	select_wait(sk->sleep, wait);
	return 0;
}
961 inttcp_ioctl(structsock *sk, intcmd, unsignedlongarg)
/* */ 962 { 963 interr;
964 switch(cmd)
965 { 966
967 caseTIOCINQ:
968 #ifdef FIXME /* FIXME: */ 969 caseFIONREAD:
970 #endif 971 { 972 unsignedlongamount;
973
974 if (sk->state == TCP_LISTEN)
975 return(-EINVAL);
976
977 sk->inuse = 1;
978 amount = tcp_readable(sk);
979 release_sock(sk);
980 err=verify_area(VERIFY_WRITE,(void *)arg,
981 sizeof(unsignedlong));
982 if(err)
983 returnerr;
984 put_fs_long(amount,(unsignedlong *)arg);
985 return(0);
986 } 987 caseSIOCATMARK:
988 { 989 intansw = sk->urg_data && sk->urg_seq == sk->copied_seq;
990
991 err = verify_area(VERIFY_WRITE,(void *) arg,
992 sizeof(unsignedlong));
993 if (err)
994 returnerr;
995 put_fs_long(answ,(int *) arg);
996 return(0);
997 } 998 caseTIOCOUTQ:
999 {1000 unsignedlongamount;
1001
1002 if (sk->state == TCP_LISTEN) return(-EINVAL);
1003 amount = sk->prot->wspace(sk);
1004 err=verify_area(VERIFY_WRITE,(void *)arg,
1005 sizeof(unsignedlong));
1006 if(err)
1007 returnerr;
1008 put_fs_long(amount,(unsignedlong *)arg);
1009 return(0);
1010 }1011 default:
1012 return(-EINVAL);
1013 }1014 }1015
1016
/*
 * This routine computes a TCP checksum.
 *
 * i386-only inline assembly: the first block folds the pseudo header
 * (source, destination, length and protocol) into the sum, the second
 * sums the TCP header and data 32 bits at a time (unrolled by 8),
 * then the 4/2/1-byte tails, and finally folds the carries into the
 * low 16 bits.  NOTE(review): the multi-line string literals inside
 * __asm__ are an old-GCC extension and will not build on modern
 * compilers.
 */

unsigned short tcp_check(struct tcphdr *th, int len,
	  unsigned long saddr, unsigned long daddr)
{
	unsigned long sum;

	if (saddr == 0) saddr = ip_my_addr();

	/*
	 * stupid, gcc complains when I use just one __asm__ block,
	 * something about too many reloads, but this is just two
	 * instructions longer than what I want
	 */
	__asm__("
	    addl %%ecx, %%ebx
	    adcl %%edx, %%ebx
	    adcl $0, %%ebx
	    "
	: "=b"(sum)
	: "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
	: "bx", "cx", "dx" );
	__asm__("
	    movl %%ecx, %%edx
	    cld
	    cmpl $32, %%ecx
	    jb 2f
	    shrl $5, %%ecx
	    clc
1:	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    loop 1b
	    adcl $0, %%ebx
	    movl %%edx, %%ecx
2:	    andl $28, %%ecx
	    je 4f
	    shrl $2, %%ecx
	    clc
3:	    lodsl
	    adcl %%eax, %%ebx
	    loop 3b
	    adcl $0, %%ebx
4:	    movl $0, %%eax
	    testw $2, %%dx
	    je 5f
	    lodsw
	    addl %%eax, %%ebx
	    adcl $0, %%ebx
	    movw $0, %%ax
5:	    test $1, %%edx
	    je 6f
	    lodsb
	    addl %%eax, %%ebx
	    adcl $0, %%ebx
6:	    movl %%ebx, %%eax
	    shrl $16, %%eax
	    addw %%ax, %%bx
	    adcw $0, %%bx
	    "
	: "=b"(sum)
	: "0"(sum), "c"(len), "S"(th)
	: "ax", "bx", "cx", "dx", "si" );

	/* We only want the bottom 16 bits, but we never cleared the top 16. */

	return((~sum) & 0xffff);
}
1101
1102
1103 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1104 unsignedlongdaddr, intlen, structsock *sk)
1105 {1106 th->check = 0;
1107 th->check = tcp_check(th, len, saddr, daddr);
1108 return;
1109 }1110
1111 /*1112 * This is the main buffer sending routine. We queue the buffer1113 * having checked it is sane seeming.1114 */1115
/*
 *	Sanity-check a fully built TCP segment and either transmit it
 *	immediately or append it to the socket's write queue for later
 *	(window-limited / Nagle / congestion-limited) transmission.
 *	Bogus frames are freed and dropped with a console diagnostic.
 */
static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 *	length of packet (not counting length of pre-tcp headers)
	 */
	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 *	Sanity check it..
	 */
	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 *	If we have queued a header size packet.. (these crash a few
	 *	tcp stacks if ack is not set)
	 */
	if (size == sizeof(struct tcphdr))
	{
		/* If it's got a syn or fin it's notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 *	Actual processing.
	 */
	tcp_statistics.TcpOutSegs++;
	/* h.seq = sequence number of the last byte in this frame
	 * (header length 4*doff excluded from the data count). */
	skb->h.seq = ntohl(th->seq) + size - 4*th->doff;

	/*
	 *	We must queue if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */
	if (after(skb->h.seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit.  So
		 * we shouldn't need to set it at all.  I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			/* NOTE(review): message says tcp_send_partial but this
			 * is tcp_send_skb — historic mislabel in the printk. */
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 *	If we don't fit we have to start the zero window
		 *	probes. This is broken - we really need to do a partial
		 *	send _first_ (This is what causes the Cisco and PC/TCP
		 *	grief).
		 */
		if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 *	This is going straight out
		 */
		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 *	This is mad. The tcp retransmit queue is put together
		 *	by the ip layer. This causes half the problems with
		 *	unroutable FIN's and other things.
		 */
		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 *	Set for next retransmit based on expected ACK time.
		 *	FIXME: We set this every time which means our
		 *	retransmits are really about a window behind.
		 */
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
1226 /*1227 * Locking problems lead us to a messy situation where we can have1228 * multiple partially complete buffers queued up. This is really bad1229 * as we don't want to be sending partial buffers. Fix this with1230 * a semaphore or similar to lock tcp_write per socket.1231 *1232 * These routines are pretty self descriptive.1233 */1234
/*
 *	Atomically detach and return the socket's half-built ("partial")
 *	buffer, cancelling its flush timer, or return NULL if there is
 *	none.  Interrupts are disabled around the test-and-clear because
 *	the partial buffer is also touched from timer/interrupt context.
 */
struct sk_buff * tcp_dequeue_partial(struct sock * sk)
{
	struct sk_buff * skb;
	unsigned long flags;

	save_flags(flags);
	cli();
	skb = sk->partial;
	if (skb) {
		sk->partial = NULL;
		/* Stop the pending 1-second flush before anyone else fires it. */
		del_timer(&sk->partial_timer);
	}
	restore_flags(flags);
	return skb;
}
1251 /*1252 * Empty the partial queue1253 */1254
1255 staticvoidtcp_send_partial(structsock *sk)
/* */1256 {1257 structsk_buff *skb;
1258
1259 if (sk == NULL)
1260 return;
1261 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1262 tcp_send_skb(sk, skb);
1263 }1264
1265 /*1266 * Queue a partial frame1267 */1268
/*
 *	Install 'skb' as the socket's half-built frame and arm a timer so
 *	it is flushed even if no more data arrives.  If another partial
 *	frame was already queued it is displaced and sent immediately
 *	(after interrupts are re-enabled) — only one partial may pend.
 */
void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	tmp = sk->partial;
	if (tmp)
		del_timer(&sk->partial_timer);
	sk->partial = skb;
	init_timer(&sk->partial_timer);
	/*
	 *	Wait up to 1 second for the buffer to fill.
	 */
	/* NOTE(review): expires is relative here (HZ ticks); assumes the
	 * old add_timer() semantics that offset by current jiffies — the
	 * visible code does not show that, confirm against kernel/sched.c. */
	sk->partial_timer.expires = HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	/* Send the displaced frame outside the cli/sti window. */
	if (tmp)
		tcp_send_skb(sk, tmp);
}
1293
1294 /*1295 * This routine sends an ack and also updates the window. 1296 */1297
/*
 *	Build and transmit a bare ACK segment carrying sequence number
 *	'sequence' and acknowledging 'ack', addressed to 'daddr'.  'th' is
 *	the header of the segment being answered (source/dest are swapped
 *	from it).  On atomic-allocation failure the ack is merely
 *	backlogged and retried via the write timer — ACKs are unreliable
 *	anyway.  Also updates the advertised window.
 */
static void tcp_send_ack(unsigned long sequence, unsigned long ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;		/* We have been reset, we may not send again */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */
	buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 *	Force it to send an ack. We don't have to do this
		 *	(ACK is unreliable) but it's much better use of
		 *	bandwidth on slow links to send a spare ack than
		 *	resend packets.
		 */
		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 *	Assemble a suitable TCP frame
	 */
	buff->len = sizeof(struct tcphdr);
	buff->sk = sk;
	buff->localroute = sk->localroute;
	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route/header — give the buffer back and drop the ack. */
		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}
	buff->len += tmp;
	t1 =(struct tcphdr *)((char *)t1 +tmp);

	/* Start from the incoming header, then swap and scrub the flags. */
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */
	t1->dest = th->source;
	t1->source = th->dest;
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 *	If we have nothing queued for transmit and the transmit timer
	 *	is on we are just doing an ACK timeout and need to switch
	 *	to a keepalive.
	 */
	if (ack == sk->acked_seq)
	{
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
				  && sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 *	Fill in the packet and send it
	 */
	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		 printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}
1412
1413 /* 1414 * This routine builds a generic TCP header. 1415 */1416
1417 extern__inlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */1418 {1419
1420 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1421 th->seq = htonl(sk->write_seq);
1422 th->psh =(push == 0) ? 1 : 0;
1423 th->doff = sizeof(*th)/4;
1424 th->ack = 1;
1425 th->fin = 0;
1426 sk->ack_backlog = 0;
1427 sk->bytes_rcv = 0;
1428 sk->ack_timed = 0;
1429 th->ack_seq = htonl(sk->acked_seq);
1430 sk->window = tcp_select_window(sk);
1431 th->window = htons(sk->window);
1432
1433 return(sizeof(*th));
1434 }1435
1436 /*1437 * This routine copies from a user buffer into a socket,1438 * and starts the transmit system.1439 */1440
/*
 *	Copy 'len' bytes from the user buffer 'from' into TCP segments and
 *	start transmission.  Blocks (unless 'nonblock') while the
 *	connection completes or while waiting for buffer memory.  Returns
 *	the number of bytes queued, or a negative errno if nothing was
 *	copied before the error.  MSG_OOB in 'flags' sends urgent data.
 */
static int tcp_write(struct sock *sk, unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;		/* bytes taken from the user so far */
	int copy;		/* bytes to place in the current segment */
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	unsigned char *buff;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1;
	prot = sk->prot;
	while(len > 0)
	{
		if (sk->err)
		{			/* Stop on an error */
			release_sock(sk);
			if (copied)
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 *	First thing we do is make sure that we are established.
		 */
		if (sk->shutdown & SEND_SHUTDOWN)
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied)
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/*
		 *	Wait for a connection to finish.
		 */
		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
		{
			if (sk->err)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			/* Not connecting either — the connection is dead. */
			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
			{
				release_sock(sk);
				if (copied)
					return(copied);

				if (sk->err)
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen)
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			release_sock(sk);
			cli();

			/* Re-test under cli() so a state change between the
			 * check and the sleep cannot be missed. */
			if (sk->state != TCP_ESTABLISHED &&
			    sk->state != TCP_CLOSE_WAIT && sk->err == 0)
			{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

	/*
	 *	The following code can result in copy <= if sk->mss is ever
	 *	decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
	 *	sk->mtu is constant once SYN processing is finished.  I.e. we
	 *	had better not get here until we've seen his SYN and at least one
	 *	valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
	 *	But ESTABLISHED should guarantee that.  sk->max_window is by definition
	 *	non-decreasing.  Note that any ioctl to set user_mss must be done
	 *	before the exchange of SYN's.  If the initial ack from the other
	 *	end has a window of 0, max_window and thus mss will both be 0.
	 */

		/*
		 *	Now we need to check if we have a half built packet.
		 */
		if ((skb = tcp_dequeue_partial(sk)) != NULL)
		{
			int hdrlen;

			/* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				 + sizeof(struct tcphdr);

			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB))
			{
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0)
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}

				memcpy_fromfs(skb->data + skb->len, from, copy);
				skb->len += copy;
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy;
			}
			/* Full, urgent, or pipe empty — send it; else requeue. */
			if ((skb->len - hdrlen) >= sk->mss ||
			    (flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);
			continue;
		}

	/*
	 *	We also need to worry about the window.
	 *	If window < 1/2 the maximum window we've seen from this
	 *	host, don't use it.  This is sender side
	 *	silly window prevention, as specified in RFC1122.
	 *	(Note that this is different than earlier versions of
	 *	SWS prevention, e.g. RFC813.).  What we actually do is
	 *	use the whole MSS.  Since the results in the right
	 *	edge of the packet being outside the window, it will
	 *	be queued for later rather than sent.
	 */
		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		if (copy > len)
			copy = len;

		/*
		 *	We should really check the window here also.
		 */
		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB))
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb;
		}
		else
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep.
		 */
		if (skb == NULL)
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition.
			 */
			tmp = sk->wmem_alloc;
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it.
			 */
			if (tmp <= sk->wmem_alloc &&
				  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0)
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->len = 0;
		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

		buff = skb->data;

		/*
		 * FIXME: we need to optimize this.
		 * Perhaps some hints here would be good.
		 */
		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 )
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}
		skb->len += tmp;
		skb->dev = dev;
		buff += tmp;
		skb->h.th =(struct tcphdr *) buff;
		tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
		if (tmp < 0)
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB)
		{
			((struct tcphdr *)buff)->urg = 1;
			((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
		}
		skb->len += tmp;
		memcpy_fromfs(buff+tmp, from, copy);

		from += copy;
		copied += copy;
		len -= copy;
		skb->len += copy;
		skb->free = 0;
		sk->write_seq += copy;

		/* Undersized frame while packets are in flight: hold it as
		 * the partial buffer rather than sending (Nagle). */
		if (send_tmp != NULL && sk->packets_out)
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

/*
 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *	interactive fast network servers. It's meant to be on and
 *	it really improves the throughput though not the echo time
 *	on my slow slip link - Alan
 */

/*
 *	Avoid possible race on send_tmp - c/o Johannes Stille
 */
	if(sk->partial && ((!sk->packets_out)
     /* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
1766 /*1767 * This is just a wrapper. 1768 */1769
1770 staticinttcp_sendto(structsock *sk, unsignedchar *from,
/* */1771 intlen, intnonblock, unsignedflags,
1772 structsockaddr_in *addr, intaddr_len)
1773 {1774 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1775 return -EINVAL;
1776 if (sk->state == TCP_CLOSE)
1777 return -ENOTCONN;
1778 if (addr_len < sizeof(*addr))
1779 return -EINVAL;
1780 if (addr->sin_family && addr->sin_family != AF_INET)
1781 return -EINVAL;
1782 if (addr->sin_port != sk->dummy_th.dest)
1783 return -EISCONN;
1784 if (addr->sin_addr.s_addr != sk->daddr)
1785 return -EISCONN;
1786 returntcp_write(sk, from, len, nonblock, flags);
1787 }1788
1789
1790 /*1791 * Send an ack if one is backlogged at this point. Ought to merge1792 * this with tcp_send_ack().1793 */1794
/*
 *	Send an ACK now if any acks are backlogged on this socket.
 *	Builds a bare ACK from the socket's template header; on
 *	atomic-allocation failure it simply rearms the write timer to
 *	retry shortly.
 */
static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	if (!sk->ack_backlog)
		return;

	/*
	 *	FIXME: we need to put code here to prevent this routine from
	 *	being called.  Being called once in a while is ok, so only check
	 *	if this is the second time in a row.
	 */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */
	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->len = sizeof(struct tcphdr);
	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route — return the buffer and give up quietly. */
		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}

	buff->len += tmp;
	t1 =(struct tcphdr *)(buff->data +tmp);

	/* Template header already holds ports; fill the live fields. */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
1864
1865 /*1866 * FIXME:1867 * This routine frees used buffers.1868 * It should consider sending an ACK to let the1869 * other end know we now have a bigger window.1870 */1871
/*
 *	Reclaim fully-consumed buffers from the receive queue and, if that
 *	opened up significant receive space, tell the peer by sending (or
 *	scheduling) a window-update ACK.
 */
static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;	/* receive space before the sweep */
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	left = sk->prot->rspace(sk);

	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can.
	 */
	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		/* Stop at the first buffer still unread or still in use
		 * by a sleeping reader (skb->users). */
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */
	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
					    left);
	if ((rspace=sk->prot->rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
	/*
	 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
	 * if the other end is offering a window smaller than the agreed on MSS
	 * (called sk->mtu here).  In theory there's no connection between send
	 * and receive, and so no reason to think that they're going to send
	 * small packets.  For the moment I'm using the hack of reducing the mss
	 * only on the send side, so I'm putting mtu here.
	 */
		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);
		}
	}
}
1954
1955 /*1956 * Handle reading urgent data. BSD has very simple semantics for1957 * this, no blocking and very strange errors 8)1958 */1959
/*
 *	Read out-of-band (urgent) data, BSD style: never blocks regardless
 *	of the socket's blocking mode.  Returns 1 with the single urgent
 *	byte copied to 'to', 0 at end of connection, or a negative errno
 *	(-EINVAL when there is no OOB byte, or it is inline/already read).
 */
static int tcp_read_urg(struct sock * sk, int nonblock,
	     unsigned char *to, int len, unsigned flags)
{
	/*
	 *	No URG data to read
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
	{
		int tmp = -sk->err;
		sk->err = 0;
		return tmp;
	}

	if (sk->state == TCP_CLOSE || sk->done)
	{
		/* First read after close reports EOF, later ones ENOTCONN. */
		if (!sk->done) {
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID)
	{
		/* Low byte of urg_data holds the urgent octet itself. */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;
		put_fs_byte(c, to);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 *	Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 *	the available implementations agree in this case:
	 *	this call should never block, independent of the
	 *	blocking state of the socket.
	 *	Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
2012
2013 /*2014 * This routine copies from a sock struct into the user buffer. 2015 */2016
/*
 *	Copy received data from the socket into the user buffer 'to'.
 *	Handles peeking (MSG_PEEK), urgent-byte skipping, FIN processing,
 *	and blocking until data arrives (unless 'nonblock').  Returns the
 *	number of bytes copied, 0 at end of stream, or a negative errno.
 */
static int tcp_read(struct sock *sk, unsigned char *to,
	int len, int nonblock, unsigned flags)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	unsigned long peek_seq;
	volatile unsigned long *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */
	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */
	if (flags & MSG_OOB)
		return tcp_read_urg(sk, nonblock, to, len, flags);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 */
	/* MSG_PEEK advances a private copy of the sequence instead of
	 * the socket's, so the data remains readable. */
	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		unsigned long offset;

		/*
		 * Are we at urgent data? Stop if we have read anything.
		 */
		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.
		 */
		current->state = TASK_INTERRUPTIBLE;

		/* Walk the receive queue looking for the buffer that
		 * contains sequence number *seq. */
		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies one sequence number */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		if (copied)
			break;

		if (sk->err)
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Nothing to read yet — free consumed buffers (may ack),
		 * then sleep until more data arrives. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */
		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */
		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?
		 */
		if (sk->urg_data)
		{
			unsigned long urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					/* Skip over the urgent byte unless
					 * it is delivered inline. */
					if (!sk->urginline)
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;	/* read up to it only */
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */
		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */
		memcpy_tofs(to,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		to += used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */
		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;
		if (used + offset < skb->len)
			continue;

		/*
		 *	Process the FIN.
		 */
		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;	/* eligible for reclaim by cleanup_rbuf() */
		continue;

	found_fin_ok:
		++*seq;		/* FIN consumes a sequence number */
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */
		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2244 /*2245 * State processing on a close. This implements the state shift for2246 * sending our FIN frame. Note that we only send a FIN for some 2247 * states. A shutdown() may have already sent the FIN, or we may be2248 * closed.2249 */2250
/*
 *	Advance the connection state machine for a close or shutdown.
 *	Sets the new state via tcp_set_state() and returns non-zero if a
 *	FIN must be transmitted.  'dead' indicates no application remains
 *	on this end, in which case a FIN_WAIT2 socket is put on a timeout
 *	so the peer cannot hold it open forever.
 */
static int tcp_close_state(struct sock *sk, int dead)
{
	int ns=TCP_CLOSE;
	int send_fin=0;
	switch(sk->state)
	{
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:	/* Closedown begin */
			ns=TCP_FIN_WAIT1;
			send_fin=1;
			break;
		case TCP_FIN_WAIT1:	/* Already closing, or FIN sent: no change */
		case TCP_FIN_WAIT2:
		case TCP_CLOSING:
			ns=sk->state;
			break;
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;
	}

	tcp_set_state(sk,ns);

	/*
	 *	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 */
	if(dead && ns==TCP_FIN_WAIT2)
	{
		/* Only start the FIN timeout if no other timer is running. */
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
	}

	return send_fin;
}
2301 /*2302 * Send a fin.2303 */2304
/*
 *	Build and send (or queue behind pending data) a FIN segment.
 *	Advances sk->write_seq past the FIN.  May sleep in the allocator,
 *	so the socket is released around the wmalloc and re-acquired.
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */
	buff->sk = sk;
	buff->len = sizeof(*t1);
	buff->localroute = sk->localroute;
	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost.
		 *	(Not good).
		 */
		buff->free = 1;
		prot->wfree(sk,buff->mem_addr, buff->mem_len);
		/* Pretend the FIN went out so the close can complete. */
		sk->write_seq++;
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}

	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */
	t1 =(struct tcphdr *)((char *)t1 +tmp);
	buff->len += tmp;
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	t1->seq = ntohl(sk->write_seq);
	sk->write_seq++;	/* FIN consumes one sequence number */
	buff->h.seq = sk->write_seq;
	t1->ack = 1;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 * If there is data in the write queue, the fin must be appended to
	 * the write queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		buff->free = 0;
		if (buff->next != NULL)
		{
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	}
	else
	{
		/* Queue empty — transmit now and start the retransmit timer. */
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2404 /*2405 * Shutdown the sending side of a connection. Much like close except2406 * that we don't receive shut down or set sk->dead=1.2407 */2408
2409 voidtcp_shutdown(structsock *sk, inthow)
/* */2410 {2411 /*2412 * We need to grab some memory, and put together a FIN,2413 * and then put it into the queue to be sent.2414 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2415 */2416
2417 if (!(how & SEND_SHUTDOWN))
2418 return;
2419
2420 /*2421 * If we've already sent a FIN, or it's a closed state2422 */2423
2424 if (sk->state == TCP_FIN_WAIT1 ||
2425 sk->state == TCP_FIN_WAIT2 ||
2426 sk->state == TCP_CLOSING ||
2427 sk->state == TCP_LAST_ACK ||
2428 sk->state == TCP_TIME_WAIT ||
2429 sk->state == TCP_CLOSE ||
2430 sk->state == TCP_LISTEN2431 )
2432 {2433 return;
2434 }2435 sk->inuse = 1;
2436
2437 /*2438 * flag that the sender has shutdown2439 */2440
2441 sk->shutdown |= SEND_SHUTDOWN;
2442
2443 /*2444 * Clear out any half completed packets. 2445 */2446
2447 if (sk->partial)
2448 tcp_send_partial(sk);
2449
2450 /*2451 * FIN if needed2452 */2453
2454 if(tcp_close_state(sk,0))
2455 tcp_send_fin(sk);
2456
2457 release_sock(sk);
2458 }2459
2460
2461 staticint2462 tcp_recvfrom(structsock *sk, unsignedchar *to,
/* */2463 intto_len, intnonblock, unsignedflags,
2464 structsockaddr_in *addr, int *addr_len)
2465 {2466 intresult;
2467
2468 /* 2469 * Have to check these first unlike the old code. If 2470 * we check them after we lose data on an error2471 * which is wrong 2472 */2473
2474 if(addr_len)
2475 *addr_len = sizeof(*addr);
2476 result=tcp_read(sk, to, to_len, nonblock, flags);
2477
2478 if (result < 0)
2479 return(result);
2480
2481 if(addr)
2482 {2483 addr->sin_family = AF_INET;
2484 addr->sin_port = sk->dummy_th.dest;
2485 addr->sin_addr.s_addr = sk->daddr;
2486 }2487 return(result);
2488 }2489
2490
2491 /*2492 * This routine will send an RST to the other tcp. 2493 */2494
2495 staticvoidtcp_reset(unsignedlongsaddr, unsignedlongdaddr, structtcphdr *th,
/* */2496 structproto *prot, structoptions *opt, structdevice *dev, inttos, intttl)
2497 {2498 structsk_buff *buff;
2499 structtcphdr *t1;
2500 inttmp;
2501 structdevice *ndev=NULL;
2502
2503 /*2504 * Cannot reset a reset (Think about it).2505 */2506
2507 if(th->rst)
2508 return;
2509
2510 /*2511 * We need to grab some memory, and put together an RST,2512 * and then put it into the queue to be sent.2513 */2514
2515 buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2516 if (buff == NULL)
2517 return;
2518
2519 buff->len = sizeof(*t1);
2520 buff->sk = NULL;
2521 buff->dev = dev;
2522 buff->localroute = 0;
2523
2524 t1 =(structtcphdr *) buff->data;
2525
2526 /*2527 * Put in the IP header and routing stuff. 2528 */2529
2530 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2531 sizeof(structtcphdr),tos,ttl);
2532 if (tmp < 0)
2533 {2534 buff->free = 1;
2535 prot->wfree(NULL, buff->mem_addr, buff->mem_len);
2536 return;
2537 }2538
2539 t1 =(structtcphdr *)((char *)t1 +tmp);
2540 buff->len += tmp;
2541 memcpy(t1, th, sizeof(*t1));
2542
2543 /*2544 * Swap the send and the receive. 2545 */2546
2547 t1->dest = th->source;
2548 t1->source = th->dest;
2549 t1->rst = 1;
2550 t1->window = 0;
2551
2552 if(th->ack)
2553 {2554 t1->ack = 0;
2555 t1->seq = th->ack_seq;
2556 t1->ack_seq = 0;
2557 }2558 else2559 {2560 t1->ack = 1;
2561 if(!th->syn)
2562 t1->ack_seq=htonl(th->seq);
2563 else2564 t1->ack_seq=htonl(th->seq+1);
2565 t1->seq=0;
2566 }2567
2568 t1->syn = 0;
2569 t1->urg = 0;
2570 t1->fin = 0;
2571 t1->psh = 0;
2572 t1->doff = sizeof(*t1)/4;
2573 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2574 prot->queue_xmit(NULL, ndev, buff, 1);
2575 tcp_statistics.TcpOutSegs++;
2576 }2577
2578
2579 /*2580 * Look for tcp options. Parses everything but only knows about MSS.2581 * This routine is always called with the packet containing the SYN.2582 * However it may also be called with the ack to the SYN. So you2583 * can't assume this is always the SYN. It's always called after2584 * we have set up sk->mtu to our own MTU.2585 *2586 * We need at minimum to add PAWS support here. Possibly large windows2587 * as Linux gets deployed on 100Mb/sec networks.2588 */2589
2590 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2591 {2592 unsignedchar *ptr;
2593 intlength=(th->doff*4)-sizeof(structtcphdr);
2594 intmss_seen = 0;
2595
2596 ptr = (unsignedchar *)(th + 1);
2597
2598 while(length>0)
2599 {2600 intopcode=*ptr++;
2601 intopsize=*ptr++;
2602 switch(opcode)
2603 {2604 caseTCPOPT_EOL:
2605 return;
2606 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */2607 length--;
2608 ptr--; /* the opsize=*ptr++ above was a mistake */2609 continue;
2610
2611 default:
2612 if(opsize<=2) /* Avoid silly options looping forever */2613 return;
2614 switch(opcode)
2615 {2616 caseTCPOPT_MSS:
2617 if(opsize==4 && th->syn)
2618 {2619 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2620 mss_seen = 1;
2621 }2622 break;
2623 /* Add other options here as people feel the urge to implement stuff like large windows */2624 }2625 ptr+=opsize-2;
2626 length-=opsize;
2627 }2628 }2629 if (th->syn)
2630 {2631 if (! mss_seen)
2632 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2633 }2634 #ifdefCONFIG_INET_PCTCP2635 sk->mss = min(sk->max_window >> 1, sk->mtu);
2636 #else2637 sk->mss = min(sk->max_window, sk->mtu);
2638 #endif2639 }2640
/*
 *	Return the classful netmask (network byte order) implied by the
 *	destination address `dst` (also network byte order).
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);
	unsigned long mask;

	if (IN_CLASSA(host))
		mask = IN_CLASSA_NET;
	else if (IN_CLASSB(host))
		mask = IN_CLASSB_NET;
	else
		mask = IN_CLASSC_NET;

	return htonl(mask);
}
2651 /*2652 * Default sequence number picking algorithm.2653 * As close as possible to RFC 793, which2654 * suggests using a 250kHz clock.2655 * Further reading shows this assumes 2MB/s networks.2656 * For 10MB/s ethernet, a 1MHz clock is appropriate.2657 * That's funny, Linux has one built in! Use it!2658 */2659
/*
 *	Initial sequence number selection, close to RFC 793's suggestion
 *	but driven by a 1MHz clock (suited to 10Mb/s ethernet): use the
 *	microsecond timestamp, wrapping naturally in an unsigned long.
 */
extern inline unsigned long tcp_init_seq(void)
{
	struct timeval now;

	do_gettimeofday(&now);
	return now.tv_sec * 1000000 + now.tv_usec;
}
2667 /*2668 * This routine handles a connection request.2669 * It should make sure we haven't already responded.2670 * Because of the way BSD works, we have to send a syn/ack now.2671 * This also means it will be harder to close a socket which is2672 * listening.2673 */2674
/*
 *	Handle a connection request (SYN) arriving on a listening socket.
 *	Clones the listener into a new sock in SYN_RECV state, chooses an
 *	MTU/MSS, and replies with a SYN+ACK carrying our MSS option.
 *	Because of the way BSD works we must answer with the SYN+ACK now.
 *	On any failure the SYN is simply dropped; the peer retransmits it.
 *	`seq` is the initial send sequence chosen by the caller
 *	(presumably tcp_init_seq() — confirm against the caller).
 */
static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 unsigned long daddr, unsigned long saddr,
		 struct options *opt, struct device *dev, unsigned long seq)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	unsigned char *ptr;
	struct sock *newsk;
	struct tcphdr *th;
	struct device *ndev=NULL;
	int tmp;
	struct rtable *rt;

	th = skb->h.th;

	/* If the listening socket is dead, refuse the connection with a RST. */
	if (!sk->dead)
	{
		sk->data_ready(sk,0);
	}
	else
	{
		if(sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * Make sure we can accept more. This prevents a flurry of SYNs
	 * from eating up all our memory.
	 */
	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * Build a new sock struct. It is sort of bad to have a socket
	 * without an inode attached to it, but the wake_up's will just
	 * wake up the listening socket, and if the listening socket is
	 * destroyed before this is taken off of the queue, this will
	 * take care of it.
	 */
	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* Just ignore the SYN. It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/* Start from a copy of the listener, then reset per-connection state. */
	memcpy(newsk, sk, sizeof(*newsk));
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	newsk->acked_seq = skb->h.th->seq+1;
	newsk->copied_seq = skb->h.th->seq+1;
	newsk->fin_seq = skb->h.th->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger=0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function=&retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/* Swap these two: they are from our point of view. */
	newsk->daddr = saddr;
	newsk->saddr = daddr;

	put_sock(newsk->num,newsk);
	newsk->dummy_th.res1 = 0;
	/* doff = 6 words: 20-byte header plus 4 bytes for the MSS option. */
	newsk->dummy_th.doff = 6;
	newsk->dummy_th.fin = 0;
	newsk->dummy_th.syn = 0;
	newsk->dummy_th.rst = 0;
	newsk->dummy_th.psh = 0;
	newsk->dummy_th.ack = 0;
	newsk->dummy_th.urg = 0;
	newsk->dummy_th.res2 = 0;
	/* NOTE(review): acked_seq/copied_seq were already set above — harmless duplication. */
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->socket = NULL;

	/* Grab the TTL and TOS values and use them. */
	newsk->ip_ttl=sk->ip_ttl;
	newsk->ip_tos=skb->ip_hdr->tos;

	/*
	 * Pick an MTU: use 576 or whatever the user asked for.
	 * Note the use of sk->user_mss — the user has no direct access
	 * to newsk. Route back to the sender for window/MSS hints.
	 */
	rt=ip_rt_route(saddr, NULL,NULL);

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
		newsk->mtu = rt->rt_mss - HEADER_SIZE;
	else
	{
#ifdef CONFIG_INET_SNARL	/* Sub Nets Are Local */
		if ((saddr ^ daddr) & default_mask(saddr))
#else
		if ((saddr ^ daddr) & dev->pa_mask)
#endif
			/* Off-net destination: use the conservative 576-byte datagram. */
			newsk->mtu = 576 - HEADER_SIZE;
		else
			newsk->mtu = MAX_WINDOW;
	}

	/* But never bigger than the device MTU. */
	newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

	/* This will min with whatever MSS arrived in the packet. */
	tcp_options(newsk,skb->h.th);

	buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		sk->err = ENOMEM;
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		/* And this will destroy it. */
		release_sock(newsk);
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->len = sizeof(struct tcphdr)+4;	/* header + MSS option */
	buff->sk = newsk;
	buff->localroute = newsk->localroute;

	t1 =(struct tcphdr *) buff->data;

	/* Put in the IP header and routing stuff. */
	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
			       IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

	/* Something went wrong — tear the embryonic connection down. */
	if (tmp < 0)
	{
		sk->err = tmp;
		buff->free = 1;
		kfree_skb(buff,FREE_WRITE);
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		release_sock(newsk);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->len += tmp;
	t1 =(struct tcphdr *)((char *)t1 +tmp);

	/* Build the SYN+ACK: start from their header, then swap the ends. */
	memcpy(t1, skb->h.th, sizeof(*t1));
	buff->h.seq = newsk->write_seq;
	t1->dest = skb->h.th->source;
	t1->source = newsk->dummy_th.source;
	t1->seq = ntohl(newsk->write_seq++);
	t1->ack = 1;
	newsk->window = tcp_select_window(newsk);
	newsk->sent_seq = newsk->write_seq;
	t1->window = ntohs(newsk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->ack_seq = ntohl(skb->h.th->seq+1);
	t1->doff = sizeof(*t1)/4+1;	/* +1 word for the MSS option */
	/* MSS option: kind 2, length 4, value in network byte order. */
	ptr =(unsigned char *)(t1+1);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
	ptr[3] =(newsk->mtu) & 0xff;

	tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
	reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
	skb->sk = newsk;

	/* Charge the sock_buff to newsk rather than the listener. */
	sk->rmem_alloc -= skb->mem_len;
	newsk->rmem_alloc += skb->mem_len;

	/* The SYN waits on the listener's queue until accept() picks it up. */
	skb_queue_tail(&sk->receive_queue,skb);
	sk->ack_backlog++;
	release_sock(newsk);
	tcp_statistics.TcpOutSegs++;
}
2947
/*
 *	Close a TCP socket. Flushes pending receive data on a descriptor
 *	close (timeout == 0), then either hard-closes (timeout != 0) or
 *	runs the normal FIN state transition.
 */
static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 * We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 */
	sk->inuse = 1;

	if(sk->state == TCP_LISTEN)
	{
		/* Special case: just drop all pending embryonic connections. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;

	if (!sk->dead)
		sk->state_change(sk);

	if (timeout == 0)
	{
		struct sk_buff *skb;

		/*
		 * Flush the receive buffers. Done only on the descriptor
		 * close, not protocol-sourced closes, because the reader
		 * process may not have drained the data yet!
		 */
		while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
			kfree_skb(skb, FREE_READ);

		/* Get rid of any half-completed packets. */
		if (sk->partial)
			tcp_send_partial(sk);
	}

	/*
	 * Timeout is not the same thing — however the code likes
	 * to send both the same way (sigh).
	 */
	if(timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* Normal path: FIN if the state machine says one is due. */
		if(tcp_close_state(sk,1)==1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
3012
3013 /*3014 * This routine takes stuff off of the write queue,3015 * and puts it in the xmit queue. This happens as incoming acks3016 * open up the remote window for us.3017 */3018
/*
 *	Take segments off the write queue and hand them to IP for
 *	transmission. Called as incoming ACKs open the remote window.
 */
static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 * Zapped socket: the bytes have to remain here. In time closedown
	 * will empty the write queue and all will be happy.
	 */
	if(sk->zapped)
		return;

	/*
	 * Anything on the queue that fits the window can be sent,
	 * provided we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */
	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/* See if we really need to send the packet. */
		if (before(skb->h.seq, sk->rcv_ack_seq +1))
		{
			/*
			 * This is already-acked data. We can discard it.
			 * This cannot currently occur.
			 */
			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;

			/*
			 * Fill in the ack seq and window here rather than earlier,
			 * to keep them monotonic. We really want to avoid taking
			 * back window allocations — legal, but RFC 1122 frowns on
			 * it. Ack and window will in general have changed since
			 * this packet was put on the write queue.
			 */
			iph = (struct iphdr *)(skb->data +
					skb->dev->hard_header_len);
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			size = skb->len - (((unsigned char *) th) - skb->data);

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			/* Header changed — the checksum must be recomputed. */
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/* IP manages our queue for some crazy reason. */
			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/* Again we slide the timer wrongly. */
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3104
3105 /*3106 * This routine deals with incoming acks, but not outgoing ones.3107 */3108
/*
 *	Handle an incoming ACK (this routine deals only with incoming acks,
 *	not outgoing ones). Updates the send window, runs slow start /
 *	congestion avoidance, retires acked segments from the retransmit
 *	queue (with Jacobson RTT estimation and Karn's algorithm), drives
 *	the FIN state transitions, and decides retransmission.
 *	NOTE(review): `saddr` appears unused in this routine.
 *
 *	flag bits:
 *	  1 - there was data in the packet as well as the ack, or new data
 *	      was sent, or we are in a shutdown state
 *	  2 - data from the retransmit queue was acked and removed
 *	  4 - window shrunk, or data from the retransmit queue was acked
 *	      and removed
 */
extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
	unsigned long ack;
	int flag = 0;

	if(sk->zapped)
		return(1);	/* Dead, can't ack any more so why bother */

	ack = ntohl(th->ack_seq);

	/* Have we discovered a larger window? */
	if (ntohs(th->window) > sk->max_window)
	{
		sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
		/* Hack: don't send partial packets to non-SWS-handling hosts. */
		sk->mss = min(sk->max_window>>1, sk->mtu);
#else
		sk->mss = min(sk->max_window, sk->mtu);
#endif
	}

	/*
	 * We have dropped back to keepalive timeouts, thus we have
	 * no retransmits pending.
	 */
	if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 * If the ack is newer than sent or older than previous acks
	 * then we can probably ignore it.
	 */
	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
	{
		if(sk->debug)
			printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);

		/* Acks for data we never sent are simply rejected. */
		if (after(ack, sk->sent_seq))
		{
			return(0);
		}

		/* Old duplicate ack: restart the keepalive timer. */
		if (sk->keepopen)
		{
			if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
		}
		return(1);
	}

	/* If there is data, set flag 1. */
	if (len != th->doff*4)
		flag |= 1;

	/* See if our window has been shrunk. */
	if (after(sk->window_seq, ack+ntohs(th->window)))
	{
		/*
		 * We may need to move packets from the send queue to the
		 * write queue if the window has been shrunk on us. The RFC
		 * says you are not allowed to shrink your window like this,
		 * but if the other end does, you must be able to deal with it.
		 * (This is an artifact of a flawed concept: we want one queue
		 * and a smarter send routine when we send all.)
		 */
		struct sk_buff *skb;
		struct sk_buff *skb2;
		struct sk_buff *wskb = NULL;

		skb2 = sk->send_head;
		sk->send_head = NULL;
		sk->send_tail = NULL;

		flag |= 4;	/* Window changed */

		sk->window_seq = ack + ntohs(th->window);
		cli();
		/* Re-sort every in-flight segment against the new window edge. */
		while (skb2 != NULL)
		{
			skb = skb2;
			skb2 = skb->link3;
			skb->link3 = NULL;
			if (after(skb->h.seq, sk->window_seq))
			{
				/* Now outside the window: back onto the write queue. */
				if (sk->packets_out > 0)
					sk->packets_out--;
				/* We may need to remove this from the dev send list. */
				if (skb->next != NULL)
				{
					skb_unlink(skb);
				}
				/* Now add it to the write_queue, preserving order. */
				if (wskb == NULL)
					skb_queue_head(&sk->write_queue,skb);
				else
					skb_append(wskb,skb);
				wskb = skb;
			}
			else
			{
				/* Still inside the window: rebuild the retransmit list. */
				if (sk->send_head == NULL)
				{
					sk->send_head = skb;
					sk->send_tail = skb;
				}
				else
				{
					sk->send_tail->link3 = skb;
					sk->send_tail = skb;
				}
				skb->link3 = NULL;
			}
		}
		sti();
	}

	/* Pipe has emptied. */
	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out= 0;
	}

	/* Update the right hand window edge of the host. */
	sk->window_seq = ack + ntohs(th->window);

	/* We don't want too many packets out there. */
	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 * This is Jacobson's slow start and congestion avoidance
		 * (SIGCOMM '88, p. 328). Because we keep cong_window in
		 * integral mss's, we can't do cwnd += 1 / cwnd. Instead,
		 * maintain a counter and increment it once every cwnd times.
		 * It's possible this should be done only if sk->retransmits
		 * == 0; we interpret "new data is acked" as including data
		 * that has been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/* In "safe" area, increase. */
			sk->cong_window++;
		else
		{
			/*
			 * In dangerous area, increase slowly. In theory this
			 * is sk->cong_window += 1 / sk->cong_window.
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

	/* Remember the highest ack received. */
	sk->rcv_ack_seq = ack;

	/*
	 * If this ack opens up a zero window, clear backoff. It was being
	 * used to time the probes, and is probably far higher than it
	 * needs to be for normal retransmission.
	 */
	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/* Was it a usable window open? */
		if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
		    ! before (sk->window_seq, sk->write_queue.next->h.seq))
		{
			sk->backoff = 0;

			/* Recompute rto from rtt; this eliminates any backoff. */
			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < 20)	/* Was 1*HZ, then 1 — turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = 20;
		}
	}

	/* See if we can take anything off of the retransmit queue. */
	while(sk->send_head != NULL)
	{
		/* Check for a bug. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 * If our packet is before the ack sequence we can discard it
		 * as it's confirmed to have arrived at the other end.
		 */
		if (before(sk->send_head->h.seq, ack+1))
		{
			struct sk_buff *oskb;

			if (sk->retransmits)
			{
				/* We were retransmitting; don't count this in RTT est. */
				flag |= 2;

				/*
				 * Even though we've gotten an ack, we're still
				 * retransmitting as long as we're sending from the
				 * retransmit queue. Keeping retransmits non-zero
				 * prevents us from getting new data interspersed
				 * with retransmissions.
				 */
				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}

			/*
			 * Note that we only reset backoff and rto in the rtt
			 * recomputation code, and that doesn't happen if there
			 * were retransmissions in effect. So the first new
			 * packet after the retransmissions is sent with the
			 * backoff still in effect. Not until we get an ack
			 * from a non-retransmitted packet do we reset the
			 * backoff and rto. This allows us to deal with a
			 * situation where the network delay has increased
			 * suddenly, i.e. Karn's algorithm (SIGCOMM '87, p5).
			 */

			/* We have one less packet out there. */
			if (sk->packets_out > 0)
				sk->packets_out --;
			/* Wake up the process, it can probably write more. */
			if (!sk->dead)
				sk->write_space(sk);
			oskb = sk->send_head;

			if (!(flag&2))	/* Not retransmitting */
			{
				long m;

				/*
				 * The following amusing code comes from Jacobson's
				 * article in SIGCOMM '88. Note that rtt and mdev
				 * are scaled versions of rtt and mean deviation.
				 * This is designed to be as fast as possible.
				 * m stands for "measurement".
				 */
				m = jiffies - oskb->when;  /* RTT */
				if(m<=0)
					m=1;		/* IS THIS RIGHT FOR <0 ??? */
				m -= (sk->rtt >> 3);	/* m is now error in rtt est */
				sk->rtt += m;		/* rtt = 7/8 rtt + 1/8 new */
				if (m < 0)
					m = -m;		/* m is now abs(error) */
				m -= (sk->mdev >> 2);	/* similar update on mdev */
				sk->mdev += m;		/* mdev = 3/4 mdev + 1/4 new */

				/* Now update timeout. Note this removes any backoff. */
				sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
				if (sk->rto > 120*HZ)
					sk->rto = 120*HZ;
				if (sk->rto < 20)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
					sk->rto = 20;
				sk->backoff = 0;
			}
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
					   in this case as we just set it up' */
			cli();
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/* We may need to remove this from the dev send list. */
			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 * XXX someone ought to look at this too: at the moment, if
	 * skb_peek() returns non-NULL, we completely ignore the timer
	 * stuff in the else clause. We ought to organize the code so the
	 * else clause can (should) be executed regardless, possibly moving
	 * the PROBE timer reset over. The skb_peek() thing should only
	 * move stuff to the write queue, NOT also manage the timers.
	 */

	/*
	 * Maybe we can take some stuff off of the write queue
	 * and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/* Add more data to the send queue. */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/* Data to queue but no room: start window probing. */
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 * From TIME_WAIT we stay in TIME_WAIT as long as we rx
		 * packets; from TCP_CLOSE we don't do anything.
		 *
		 * From anything else, if there is write data (or fin)
		 * pending, we use a TIME_WRITE timeout, else if keepalive
		 * we reset to a KEEPALIVE timeout, else we delete the timer.
		 *
		 * We do not set flag for nominal write data, otherwise we
		 * may force a state where we start to write itsy bitsy
		 * tidbits of data.
		 */
		switch(sk->state) {
		case TCP_TIME_WAIT:
			/* Keep us in TIME_WAIT until we stop getting packets. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/* Don't touch the timer. */
			break;
		default:
			/*
			 * Must check send_head, write_queue, and ack_backlog
			 * to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 * We have nothing queued but space to send. Send any partial
	 * packets immediately (end of Nagle rule application).
	 */
	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 * In the LAST_ACK case, the other end FIN'd us, we FIN'd them, and
	 * we are now waiting for an acknowledge to our FIN. The other end
	 * is already in TIME_WAIT. Move to TCP_CLOSE on success.
	 */
	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if(sk->debug)
			printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			tcp_set_state(sk,TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of our initiating the
	 * close. Move to FIN_WAIT2 to await a FIN from the other end. Set
	 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still come in.
	 */
	if (sk->state == TCP_FIN_WAIT1)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of a simultaneous
	 * close. Move to TIME_WAIT.
	 */
	if (sk->state == TCP_CLOSING)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/* Final ack of a three way shake. */
	if(sk->state==TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest=th->source;
		sk->copied_seq = sk->acked_seq;
		if(!sk->dead)
			sk->state_change(sk);
		if(sk->max_window==0)
		{
			sk->max_window=32;	/* Sanity check */
			sk->mss=min(sk->max_window,sk->mtu);
		}
	}

	/*
	 * I make no guarantees about the first clause in the following
	 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
	 * what conditions "!flag" would be true. However I think the rest
	 * of the conditions would prevent that from causing any
	 * unnecessary retransmission.
	 *
	 * Clearly if the first packet has expired it should be
	 * retransmitted. The other alternative, "flag&2 && retransmits",
	 * is harder to explain: you have to look carefully at how and when
	 * the timer is set and with what timeout. The most recent
	 * transmission always sets the timer, so in general if the most
	 * recent thing has timed out, everything before it has as well —
	 * so we want to go ahead and retransmit some more. If we didn't
	 * explicitly test for this condition, chances are
	 * "when + rto < jiffies" would not be true: rto is increased fast
	 * enough that the next packet would almost never be retransmitted
	 * immediately, and with the Karn sampling the timeout would double
	 * each time, making recovery from a single dropped packet hideous.
	 *
	 * Note that tcp_do_retransmit is called with all==1. Setting
	 * cong_window back to 1 at the timeout will cause us to send 1,
	 * then 2, etc. packets. As long as no further losses occur, this
	 * seems reasonable.
	 */
	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	       (((flag&2) && sk->retransmits) ||
	       (sk->send_head->when + sk->rto < jiffies)))
	{
		if(sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return(1);
}
3695
3696 /*3697 * Process the FIN bit. This now behaves as it is supposed to work3698 * and the FIN takes effect when it is validly part of sequence3699 * space. Not before when we get holes.3700 *3701 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT3702 * (and thence onto LAST-ACK and finally, CLOSE, we never enter3703 * TIME-WAIT)3704 *3705 * If we are in FINWAIT-1, a received FIN indicates simultaneous3706 * close and we go into CLOSING (and later onto TIME-WAIT)3707 *3708 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.3709 *3710 */3711
static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	/*
	 *	Process a received FIN for socket 'sk'.  'skb'/'th' is the frame
	 *	carrying the FIN.  Drives the connection state machine:
	 *	ESTABLISHED -> CLOSE_WAIT, FIN_WAIT1 -> CLOSING (simultaneous
	 *	close), FIN_WAIT2 -> TIME_WAIT.  Always returns 0.
	 */

	/*
	 *	Record the sequence number one past this FIN: data length plus
	 *	one each for the SYN and FIN flags if set.
	 */
	sk->fin_seq = th->seq + skb->len + th->syn + th->fin;

	/* Wake up anyone sleeping on, or async-watching, this socket. */
	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * move to CLOSE_WAIT, tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			/* FIN+RST together: no more traffic either way. */
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * received a retransmission of the FIN,
			 * restart the TIME_WAIT timer.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens, we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */

			if(sk->ip_xmit_timeout != TIME_WRITE)
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * received a FIN -- send ACK and enter TIME_WAIT
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * already in CLOSE
			 */
			break;
		default:
			/* Any other state: finish the close via LAST_ACK. */
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}
3791
3792
3793 /*3794 * This routine handles the data. If there is room in the buffer,3795 * it will be have already been moved into it. If there is no3796 * room, then we will just have to discard the packet.3797 */3798
3799 extern__inline__inttcp_data(structsk_buff *skb, structsock *sk,
/* */3800 unsignedlongsaddr, unsignedshortlen)
3801 {3802 structsk_buff *skb1, *skb2;
3803 structtcphdr *th;
3804 intdup_dumped=0;
3805 unsignedlongnew_seq;
3806 unsignedlongshut_seq;
3807
3808 th = skb->h.th;
3809 skb->len = len -(th->doff*4);
3810
3811 /*3812 * The bytes in the receive read/assembly queue has increased. Needed for the3813 * low memory discard algorithm 3814 */3815
3816 sk->bytes_rcv += skb->len;
3817
3818 if (skb->len == 0 && !th->fin)
3819 {3820 /* 3821 * Don't want to keep passing ack's back and forth. 3822 * (someone sent us dataless, boring frame)3823 */3824 if (!th->ack)
3825 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3826 kfree_skb(skb, FREE_READ);
3827 return(0);
3828 }3829
3830 /*3831 * We no longer have anyone receiving data on this connection.3832 */3833
3834 #ifndef TCP_DONT_RST_SHUTDOWN
3835
3836 if(sk->shutdown & RCV_SHUTDOWN)
3837 {3838 /*3839 * FIXME: BSD has some magic to avoid sending resets to3840 * broken 4.2 BSD keepalives. Much to my surprise a few non3841 * BSD stacks still have broken keepalives so we want to3842 * cope with it.3843 */3844
3845 if(skb->len) /* We don't care if it's just an ack or3846 a keepalive/window probe */3847 {3848 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */3849
3850 /* Do this the way 4.4BSD treats it. Not what I'd3851 regard as the meaning of the spec but it's what BSD3852 does and clearly they know everything 8) */3853
3854 /*3855 * This is valid because of two things3856 *3857 * a) The way tcp_data behaves at the bottom.3858 * b) A fin takes effect when read not when received.3859 */3860
3861 shut_seq=sk->acked_seq+1; /* Last byte */3862
3863 if(after(new_seq,shut_seq))
3864 {3865 if(sk->debug)
3866 printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n",
3867 sk, new_seq, shut_seq, sk->blog);
3868 if(sk->dead)
3869 {3870 sk->acked_seq = new_seq + th->fin;
3871 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3872 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3873 tcp_statistics.TcpEstabResets++;
3874 tcp_set_state(sk,TCP_CLOSE);
3875 sk->err = EPIPE;
3876 sk->shutdown = SHUTDOWN_MASK;
3877 kfree_skb(skb, FREE_READ);
3878 return 0;
3879 }3880 }3881 }3882 }3883
3884 #endif3885
3886 /*3887 * Now we have to walk the chain, and figure out where this one3888 * goes into it. This is set up so that the last packet we received3889 * will be the first one we look at, that way if everything comes3890 * in order, there will be no performance loss, and if they come3891 * out of order we will be able to fit things in nicely.3892 *3893 * [AC: This is wrong. We should assume in order first and then walk3894 * forwards from the first hole based upon real traffic patterns.]3895 * 3896 */3897
3898 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */3899 {3900 skb_queue_head(&sk->receive_queue,skb);
3901 skb1= NULL;
3902 }3903 else3904 {3905 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
3906 {3907 if(sk->debug)
3908 {3909 printk("skb1=%p :", skb1);
3910 printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
3911 printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
3912 printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
3913 sk->acked_seq);
3914 }3915
3916 /*3917 * Optimisation: Duplicate frame or extension of previous frame from3918 * same sequence point (lost ack case).3919 * The frame contains duplicate data or replaces a previous frame3920 * discard the previous frame (safe as sk->inuse is set) and put3921 * the new one in its place.3922 */3923
3924 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3925 {3926 skb_append(skb1,skb);
3927 skb_unlink(skb1);
3928 kfree_skb(skb1,FREE_READ);
3929 dup_dumped=1;
3930 skb1=NULL;
3931 break;
3932 }3933
3934 /*3935 * Found where it fits3936 */3937
3938 if (after(th->seq+1, skb1->h.th->seq))
3939 {3940 skb_append(skb1,skb);
3941 break;
3942 }3943
3944 /*3945 * See if we've hit the start. If so insert.3946 */3947 if (skb1 == skb_peek(&sk->receive_queue))
3948 {3949 skb_queue_head(&sk->receive_queue, skb);
3950 break;
3951 }3952 }3953 }3954
3955 /*3956 * Figure out what the ack value for this frame is3957 */3958
3959 th->ack_seq = th->seq + skb->len;
3960 if (th->syn)
3961 th->ack_seq++;
3962 if (th->fin)
3963 th->ack_seq++;
3964
3965 if (before(sk->acked_seq, sk->copied_seq))
3966 {3967 printk("*** tcp.c:tcp_data bug acked < copied\n");
3968 sk->acked_seq = sk->copied_seq;
3969 }3970
3971 /*3972 * Now figure out if we can ack anything. This is very messy because we really want two3973 * receive queues, a completed and an assembly queue. We also want only one transmit3974 * queue.3975 */3976
3977 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
3978 {3979 if (before(th->seq, sk->acked_seq+1))
3980 {3981 intnewwindow;
3982
3983 if (after(th->ack_seq, sk->acked_seq))
3984 {3985 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3986 if (newwindow < 0)
3987 newwindow = 0;
3988 sk->window = newwindow;
3989 sk->acked_seq = th->ack_seq;
3990 }3991 skb->acked = 1;
3992
3993 /*3994 * When we ack the fin, we do the FIN 3995 * processing.3996 */3997
3998 if (skb->h.th->fin)
3999 {4000 tcp_fin(skb,sk,skb->h.th);
4001 }4002
4003 for(skb2 = skb->next;
4004 skb2 != (structsk_buff *)&sk->receive_queue;
4005 skb2 = skb2->next)
4006 {4007 if (before(skb2->h.th->seq, sk->acked_seq+1))
4008 {4009 if (after(skb2->h.th->ack_seq, sk->acked_seq))
4010 {4011 newwindow = sk->window -
4012 (skb2->h.th->ack_seq - sk->acked_seq);
4013 if (newwindow < 0)
4014 newwindow = 0;
4015 sk->window = newwindow;
4016 sk->acked_seq = skb2->h.th->ack_seq;
4017 }4018 skb2->acked = 1;
4019 /*4020 * When we ack the fin, we do4021 * the fin handling.4022 */4023 if (skb2->h.th->fin)
4024 {4025 tcp_fin(skb,sk,skb->h.th);
4026 }4027
4028 /*4029 * Force an immediate ack.4030 */4031
4032 sk->ack_backlog = sk->max_ack_backlog;
4033 }4034 else4035 {4036 break;
4037 }4038 }4039
4040 /*4041 * This also takes care of updating the window.4042 * This if statement needs to be simplified.4043 */4044 if (!sk->delay_acks ||
4045 sk->ack_backlog >= sk->max_ack_backlog ||
4046 sk->bytes_rcv > sk->max_unacked || th->fin) {4047 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4048 }4049 else4050 {4051 sk->ack_backlog++;
4052 if(sk->debug)
4053 printk("Ack queued.\n");
4054 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4055 }4056 }4057 }4058
4059 /*4060 * If we've missed a packet, send an ack.4061 * Also start a timer to send another.4062 */4063
4064 if (!skb->acked)
4065 {4066
4067 /*4068 * This is important. If we don't have much room left,4069 * we need to throw out a few packets so we have a good4070 * window. Note that mtu is used, not mss, because mss is really4071 * for the send side. He could be sending us stuff as large as mtu.4072 */4073
4074 while (sk->prot->rspace(sk) < sk->mtu)
4075 {4076 skb1 = skb_peek(&sk->receive_queue);
4077 if (skb1 == NULL)
4078 {4079 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4080 break;
4081 }4082
4083 /*4084 * Don't throw out something that has been acked. 4085 */4086
4087 if (skb1->acked)
4088 {4089 break;
4090 }4091
4092 skb_unlink(skb1);
4093 kfree_skb(skb1, FREE_READ);
4094 }4095 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4096 sk->ack_backlog++;
4097 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4098 }4099 else4100 {4101 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4102 }4103
4104 /*4105 * Now tell the user we may have some data. 4106 */4107
4108 if (!sk->dead)
4109 {4110 if(sk->debug)
4111 printk("Data wakeup.\n");
4112 sk->data_ready(sk,0);
4113 }4114 return(0);
4115 }4116
4117
4118 /*4119 * This routine is only called when we have urgent data4120 * signalled. Its the 'slow' part of tcp_urg. It could be4121 * moved inline now as tcp_urg is only called from one4122 * place. We handle URGent data wrong. We have to - as4123 * BSD still doesn't use the correction from RFC961.4124 */4125
4126 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4127 {4128 unsignedlongptr = ntohs(th->urg_ptr);
4129
4130 if (ptr)
4131 ptr--;
4132 ptr += th->seq;
4133
4134 /* ignore urgent data that we've already seen and read */4135 if (after(sk->copied_seq, ptr))
4136 return;
4137
4138 /* do we already have a newer (or duplicate) urgent pointer? */4139 if (sk->urg_data && !after(ptr, sk->urg_seq))
4140 return;
4141
4142 /* tell the world about our new urgent pointer */4143 if (sk->proc != 0) {4144 if (sk->proc > 0) {4145 kill_proc(sk->proc, SIGURG, 1);
4146 }else{4147 kill_pg(-sk->proc, SIGURG, 1);
4148 }4149 }4150 sk->urg_data = URG_NOTYET;
4151 sk->urg_seq = ptr;
4152 }4153
4154 /*4155 * This is the 'fast' part of urgent handling.4156 */4157
4158 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4159 unsignedlongsaddr, unsignedlonglen)
4160 {4161 unsignedlongptr;
4162
4163 /*4164 * Check if we get a new urgent pointer - normally not 4165 */4166
4167 if (th->urg)
4168 tcp_check_urg(sk,th);
4169
4170 /*4171 * Do we wait for any urgent data? - normally not4172 */4173
4174 if (sk->urg_data != URG_NOTYET)
4175 return 0;
4176
4177 /*4178 * Is the urgent pointer pointing into this packet? 4179 */4180
4181 ptr = sk->urg_seq - th->seq + th->doff*4;
4182 if (ptr >= len)
4183 return 0;
4184
4185 /*4186 * Ok, got the correct packet, update info 4187 */4188
4189 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4190 if (!sk->dead)
4191 sk->data_ready(sk,0);
4192 return 0;
4193 }4194
4195 /*4196 * This will accept the next outstanding connection. 4197 */4198
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	/*
	 *	Accept the next outstanding connection on listening socket 'sk'.
	 *	Blocks until one is available unless O_NONBLOCK is set in 'flags'.
	 *	Returns the new socket, or NULL with sk->err set (EINVAL if not
	 *	listening, EAGAIN if non-blocking and none pending, ERESTARTSYS
	 *	if interrupted by a signal).
	 */
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 *	We need to make sure that this socket is listening,
	 *	and that it has something pending.
	 */

	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race. */
	/* Interrupts off while we take ownership of the socket, so the
	   established-queue check and sleep can't race with the bottom half. */
	cli();
	sk->inuse = 1;

	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Drop the lock before sleeping so incoming frames can be
		   processed and complete a handshake while we wait. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked)
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		/* Re-take ownership before looking at the queue again. */
		sk->inuse = 1;
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk.
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4253
4254 /*4255 * This will initiate an outgoing connection. 4256 */4257
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	/*
	 *	Initiate an outgoing connection: validate the address, pick
	 *	sequence numbers, build and transmit the SYN (with an MSS
	 *	option), move to SYN_SENT and start the retransmit timer.
	 *	Returns 0 or a negative errno.
	 */
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
	{
		return(-EISCONN);
	}

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */

	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 *	Don't want a TCP connection going to a broadcast address
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	/* Drop the lock across the sleeping allocation below. */
	release_sock(sk);

	buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->len = 24;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;

	t1 = (struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	rt=ip_rt_route(sk->daddr, NULL, NULL);


	/*
	 *	We need to build the routing stuff from the things saved in skb.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 +tmp);

	/* Template header from the socket, then turn it into a SYN. */
	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	t1->seq = ntohl(sk->write_seq++);
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;	/* 24 bytes: header plus the 4-byte MSS option */
	/* use 512 or whatever user asked for */

	/* Per-route window clamp, if the routing entry supplies one. */
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/* MTU selection: user setting, then route, then the SNARL/netmask
	   heuristic (576 off-net, MAX_WINDOW on the local net). */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
		sk->mtu = rt->rt_mss;
	else
	{
#ifdef CONFIG_INET_SNARL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		/* NOTE(review): relies on build_header having set dev -
		   presumably guaranteed when tmp >= 0; confirm. */
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - HEADER_SIZE;
		else
			sk->mtu = MAX_WINDOW;
	}

	/*
	 *	but not bigger than device MTU
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	Put in the TCP options to say MTU.
	 */

	ptr = (unsigned char *)(t1+1);
	ptr[0] = 2;	/* option kind: MSS */
	ptr[1] = 4;	/* option length */
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 *	This must go first otherwise a really quick response will get reset.
	 */

	tcp_set_state(sk,TCP_SYN_SENT);
	sk->rto = TCP_TIMEOUT_INIT;
#if 0 /* we already did this */
	init_timer(&sk->retransmit_timer);
#endif
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = TCP_SYN_RETRIES;

	sk->prot->queue_xmit(sk, dev, buff, 0);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4418
4419 /* This functions checks to see if the tcp header is actually acceptable. */4420 extern__inline__inttcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4421 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4422 {4423 unsignedlongnext_seq;
4424
4425 next_seq = len - 4*th->doff;
4426 if (th->fin)
4427 next_seq++;
4428 /* if we have a zero window, we can't have any data in the packet.. */4429 if (next_seq && !sk->window)
4430 gotoignore_it;
4431 next_seq += th->seq;
4432
4433 /*4434 * This isn't quite right. sk->acked_seq could be more recent4435 * than sk->window. This is however close enough. We will accept4436 * slightly more packets than we should, but it should not cause4437 * problems unless someone is trying to forge packets.4438 */4439
4440 /* have we already seen all of this packet? */4441 if (!after(next_seq+1, sk->acked_seq))
4442 gotoignore_it;
4443 /* or does it start beyond the window? */4444 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4445 gotoignore_it;
4446
4447 /* ok, at least part of this packet would seem interesting.. */4448 return 1;
4449
4450 ignore_it:
4451 if (th->rst)
4452 return 0;
4453
4454 /*4455 * Send a reset if we get something not ours and we are4456 * unsynchronized. Note: We don't do anything to our end. We4457 * are just killing the bogus remote connection then we will4458 * connect again and it will work (with luck).4459 */4460
4461 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4462 {4463 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4464 return 1;
4465 }4466
4467 /* Try to resync things. */4468 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4469 return 0;
4470 }4471
4472 /*4473 * When we get a reset we do this.4474 */4475
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	/*
	 *	Standard processing of a received RST: mark the socket dead,
	 *	pick an errno appropriate to the state we were in, move to
	 *	CLOSE (subject to RFC1337 TIME-WAIT protection), wake the
	 *	owner, and dispose of the frame.  Always returns 0.
	 */
	sk->zapped = 1;

	switch (sk->state)
	{
		case TCP_SYN_SENT:
			/* Connection attempt rejected. */
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			/* Remote already closed; writes would be pointless. */
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk, TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk, TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
4504 /*4505 * A TCP packet has arrived.4506 */4507
int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	unsigned long daddr, unsigned short len,
	unsigned long saddr, int redo, struct inet_protocol * protocol)
{
	/*
	 *	Main TCP receive routine.  Called from IP (redo==0) for a fresh
	 *	frame, or from the backlog processor (redo==1) for a frame that
	 *	was queued while the socket was busy.  Follows the RFC793 event
	 *	processing list with the RFC1122 corrections.  Always returns 0.
	 */
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;	/* set once a SYN has been validly consumed (SYN_SENT path) */

	if (!skb)
	{
		printk("IMPOSSIBLE 1\n");
		return(0);
	}

	if (!dev)
	{
		printk("IMPOSSIBLE 2\n");
		return(0);
	}

	tcp_statistics.TcpInSegs++;

	/* Frames not addressed to this host (promiscuous pickups etc.)
	   are simply dropped. */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket.
	 */

	sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo)
	{
		/* First pass: checksum, byte-order fix-up, socket lock check. */
		if (tcp_check(th, len, saddr, daddr ))
		{
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}
		th->seq = ntohl(th->seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset
			 *	(checked in tcp_reset)
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			/*
			 *	Discard frame
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->len = len;
		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here. */
		cli();
		if (sk->inuse)
		{
			/* Socket busy - queue for later redo pass. */
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Backlog pass: the socket may have died while queued. */
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 */

	if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

	skb->sk=sk;
	sk->rmem_alloc += skb->mem_len;

	/*
	 *	This basically follows the flow suggested by RFC793, with the
	 *	corrections in RFC1122. We don't implement precedence and we
	 *	process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn
	 *	notes in the KA9Q code the RFC793 incoming segment rules don't
	 *	initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed
			 *	(old segments). Broadcast/multicast SYN isn't
			 *	allowed. Note - bug if you change the netmask on
			 *	a running connection it can go broadcast. Even
			 *	Sun's have this problem so I'm ignoring it
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is
			 *	nothing else in the frame. KA9Q has an option to
			 *	send data with the syn, BSD accepts data with the
			 *	syn up to the [to be] advertised window and
			 *	Solaris 2.1 gives you a protocol error. For now
			 *	we just ignore it, that fits the spec precisely
			 *	and avoids incompatibilities. It would be nice in
			 *	future to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and
		 *	either reset for bad matches or go to connected
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/* Reset the ack - its an ack from a
					   different connection  [ th->rst is
					   checked in tcp_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq=th->seq+1;
				sk->fin_seq=th->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if(sk->max_window==0)
				{
					sk->max_window = 32;	/* Sanity check */
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port.
	 *	There is a more complex suggestion for fixing these reuse issues
	 *	in RFC1644 but not yet ready for general use. Also see RFC1379.
	 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(th->seq, sk->acked_seq) && !th->rst)
		{
			/* Kill the old TIME_WAIT socket and hand the frame to a
			   listener on the same port, with a bumped sequence. */
			long seq=sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			sk->rmem_alloc -= skb->mem_len;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->mem_len;
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */

	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}

	/*
	 *	Process the ACK
	 */


	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	Process urgent data
	 */

	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}


	/*
	 *	Process the encapsulated data
	 */

	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */

	release_sock(sk);
	return 0;
}
4899 /*4900 * This routine sends a packet with an out of date sequence4901 * number. It assumes the other end will try to ack it.4902 */4903
4904 staticvoidtcp_write_wakeup(structsock *sk)
/* */4905 {4906 structsk_buff *buff;
4907 structtcphdr *t1;
4908 structdevice *dev=NULL;
4909 inttmp;
4910
4911 if (sk->zapped)
4912 return; /* After a valid reset we can send no more */4913
4914 /*4915 * Write data can still be transmitted/retransmitted in the4916 * following states. If any other state is encountered, return.4917 * [listen/close will never occur here anyway]4918 */4919
4920 if (sk->state != TCP_ESTABLISHED &&
4921 sk->state != TCP_CLOSE_WAIT &&
4922 sk->state != TCP_FIN_WAIT1 &&
4923 sk->state != TCP_LAST_ACK &&
4924 sk->state != TCP_CLOSING4925 )
4926 {4927 return;
4928 }4929
4930 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
4931 if (buff == NULL)
4932 return;
4933
4934 buff->len = sizeof(structtcphdr);
4935 buff->free = 1;
4936 buff->sk = sk;
4937 buff->localroute = sk->localroute;
4938
4939 t1 = (structtcphdr *) buff->data;
4940
4941 /* Put in the IP header and routing stuff. */4942 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4943 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
4944 if (tmp < 0)
4945 {4946 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4947 return;
4948 }4949
4950 buff->len += tmp;
4951 t1 = (structtcphdr *)((char *)t1 +tmp);
4952
4953 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
4954
4955 /*4956 * Use a previous sequence.4957 * This should cause the other end to send an ack.4958 */4959
4960 t1->seq = htonl(sk->sent_seq-1);
4961 t1->ack = 1;
4962 t1->res1= 0;
4963 t1->res2= 0;
4964 t1->rst = 0;
4965 t1->urg = 0;
4966 t1->psh = 0;
4967 t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */4968 t1->syn = 0;
4969 t1->ack_seq = ntohl(sk->acked_seq);
4970 t1->window = ntohs(tcp_select_window(sk));
4971 t1->doff = sizeof(*t1)/4;
4972 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
4973 /*4974 * Send it and free it.4975 * This will prevent the timer from automatically being restarted.4976 */4977 sk->prot->queue_xmit(sk, dev, buff, 1);
4978 tcp_statistics.TcpOutSegs++;
4979 }4980
4981 /*4982 * A window probe timeout has occurred.4983 */4984
4985 voidtcp_send_probe0(structsock *sk)
/* */4986 {4987 if (sk->zapped)
4988 return; /* After a valid reset we can send no more */4989
4990 tcp_write_wakeup(sk);
4991
4992 sk->backoff++;
4993 sk->rto = min(sk->rto << 1, 120*HZ);
4994 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
4995 sk->retransmits++;
4996 sk->prot->retransmits ++;
4997 }4998
4999 /*5000 * Socket option code for TCP. 5001 */5002
5003 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5004 {5005 intval,err;
5006
5007 if(level!=SOL_TCP)
5008 returnip_setsockopt(sk,level,optname,optval,optlen);
5009
5010 if (optval == NULL)
5011 return(-EINVAL);
5012
5013 err=verify_area(VERIFY_READ, optval, sizeof(int));
5014 if(err)
5015 returnerr;
5016
5017 val = get_fs_long((unsignedlong *)optval);
5018
5019 switch(optname)
5020 {5021 caseTCP_MAXSEG:
5022 /*5023 * values greater than interface MTU won't take effect. however at5024 * the point when this call is done we typically don't yet know5025 * which interface is going to be used5026 */5027 if(val<1||val>MAX_WINDOW)
5028 return -EINVAL;
5029 sk->user_mss=val;
5030 return 0;
5031 caseTCP_NODELAY:
5032 sk->nonagle=(val==0)?0:1;
5033 return 0;
5034 default:
5035 return(-ENOPROTOOPT);
5036 }5037 }5038
5039 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5040 {5041 intval,err;
5042
5043 if(level!=SOL_TCP)
5044 returnip_getsockopt(sk,level,optname,optval,optlen);
5045
5046 switch(optname)
5047 {5048 caseTCP_MAXSEG:
5049 val=sk->user_mss;
5050 break;
5051 caseTCP_NODELAY:
5052 val=sk->nonagle;
5053 break;
5054 default:
5055 return(-ENOPROTOOPT);
5056 }5057 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5058 if(err)
5059 returnerr;
5060 put_fs_long(sizeof(int),(unsignedlong *) optlen);
5061
5062 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5063 if(err)
5064 returnerr;
5065 put_fs_long(val,(unsignedlong *)optval);
5066
5067 return(0);
5068 }5069
5070
5071 structprototcp_prot = {5072 sock_wmalloc,
5073 sock_rmalloc,
5074 sock_wfree,
5075 sock_rfree,
5076 sock_rspace,
5077 sock_wspace,
5078 tcp_close,
5079 tcp_read,
5080 tcp_write,
5081 tcp_sendto,
5082 tcp_recvfrom,
5083 ip_build_header,
5084 tcp_connect,
5085 tcp_accept,
5086 ip_queue_xmit,
5087 tcp_retransmit,
5088 tcp_write_wakeup,
5089 tcp_read_wakeup,
5090 tcp_rcv,
5091 tcp_select,
5092 tcp_ioctl,
5093 NULL,
5094 tcp_shutdown,
5095 tcp_setsockopt,
5096 tcp_getsockopt,
5097 128,
5098 0,
5099 {NULL,},
5100 "TCP",
5101 0, 0
5102 };