1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: @(#)tcp.c 1.0.16 05/25/93
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@no.unit.nvg>
20 *
21 * Fixes:
22 * Alan Cox : Numerous verify_area() calls
23 * Alan Cox : Set the ACK bit on a reset
24 * Alan Cox : Stopped it crashing if it closed while sk->inuse=1
25 * and was trying to connect (tcp_err()).
26 * Alan Cox : All icmp error handling was broken;
27 * pointers passed were wrong and the
28 * socket was looked up backwards. Nobody
29 * tested any icmp error code obviously.
30 * Alan Cox : tcp_err() now handled properly. It wakes people
31 * on errors. select behaves and the icmp error race
32 * has gone by moving it into sock.c
33 * Alan Cox : tcp_reset() fixed to work for everything not just
34 * packets for unknown sockets.
35 * Alan Cox : tcp option processing.
36 * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong]
37 * Herp Rosmanith : More reset fixes
38 * Alan Cox : No longer acks invalid rst frames. Acking
39 * any kind of RST is right out.
40 * Alan Cox : Sets an ignore me flag on an rst receive
41 * otherwise odd bits of prattle escape still
42 * Alan Cox : Fixed another acking RST frame bug. Should stop
43 * LAN workplace lockups.
44 * Alan Cox : Some tidyups using the new skb list facilities
45 * Alan Cox : sk->keepopen now seems to work
46 * Alan Cox : Pulls options out correctly on accepts
47 * Alan Cox : Fixed assorted sk->rqueue->next errors
48 * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops.
49 * Alan Cox : Tidied tcp_data to avoid a potential nasty.
50 * Alan Cox : Added some better commenting, as the tcp is hard to follow
51 * Alan Cox : Removed incorrect check for 20 * psh
52 * Michael O'Reilly : ack < copied bug fix.
53 * Johannes Stille : Misc tcp fixes (not all in yet).
54 * Alan Cox : FIN with no memory -> CRASH
55 * Alan Cox : Added socket option proto entries. Also added awareness of them to accept.
56 * Alan Cox : Added TCP options (SOL_TCP)
57 * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets.
58 * Alan Cox : Use ip_tos/ip_ttl settings.
59 * Alan Cox : Handle FIN (more) properly (we hope).
60 * Alan Cox : RST frames sent on unsynchronised state ack error.
61 * Alan Cox : Put in missing check for SYN bit.
62 * Alan Cox : Added tcp_select_window() aka NET2E
63 * window non shrink trick.
64 * Alan Cox : Added a couple of small NET2E timer fixes
65 * Charles Hedrick : TCP fixes
66 * Toomas Tamm : TCP window fixes
67 * Alan Cox : Small URG fix to rlogin ^C ack fight
68 * Charles Hedrick : Rewrote most of it to actually work
69 * Linus : Rewrote tcp_read() and URG handling
70 * completely
71 * Gerhard Koerting: Fixed some missing timer handling
72 * Matthew Dillon : Reworked TCP machine states as per RFC
73 * Gerhard Koerting: PC/TCP workarounds
74 * Adam Caldwell : Assorted timer/timing errors
75 * Matthew Dillon : Fixed another RST bug
76 * Alan Cox : Move to kernel side addressing changes.
77 * Alan Cox : Beginning work on TCP fastpathing (not yet usable)
78 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
79 * Alan Cox : TCP fast path debugging
80 * Alan Cox : Window clamping
81 * Michael Riepe : Bug in tcp_check()
82 * Matt Dillon : More TCP improvements and RST bug fixes
83 * Matt Dillon : Yet more small nasties removed from the TCP code
84 * (Be very nice to this man if tcp finally works 100%) 8)
85 * Alan Cox : BSD accept semantics.
86 * Alan Cox : Reset on closedown bug.
87 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
88 * Michael Pall : Handle select() after URG properly in all cases.
89 * Michael Pall : Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin).
90 * Michael Pall : Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now.
91 * Michael Pall : recv(...,MSG_OOB) never blocks in the BSD api.
92 * Alan Cox : Changed the semantics of sk->socket to
93 * fix a race and a signal problem with
94 * accept() and async I/O.
95 * Alan Cox : Relaxed the rules on tcp_sendto().
96 * Yury Shevchuk : Really fixed accept() blocking problem.
97 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
98 * clients/servers which listen in on
99 * fixed ports.
100 * Alan Cox : Cleaned the above up and shrank it to
101 * a sensible code size.
102 * Alan Cox : Self connect lockup fix.
103 * Alan Cox : No connect to multicast.
104 * Ross Biro : Close unaccepted children on master
105 * socket close.
106 * Alan Cox : Reset tracing code.
107 * Alan Cox : Spurious resets on shutdown.
108 * Alan Cox : Giant 15 minute/60 second timer error
109 * Alan Cox : Small whoops in selecting before an accept.
110 * Alan Cox : Kept the state trace facility since it's
111 * handy for debugging.
112 * Alan Cox : More reset handler fixes.
113 * Alan Cox : Started rewriting the code based on the RFC's
114 * for other useful protocol references see:
115 * Comer, KA9Q NOS, and for a reference on the
116 * difference between specifications and how BSD
117 * works see the 4.4lite source.
118 * A.N.Kuznetsov : Don't time wait on completion of tidy
119 * close.
120 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
121 * Linus Torvalds : Fixed BSD port reuse to work on the first syn
122 * Alan Cox : Reimplemented timers as per the RFC and using multiple
123 * timers for sanity.
124 * Alan Cox : Small bug fixes, and a lot of new
125 * comments.
126 * Alan Cox : Fixed dual reader crash by locking
127 * the buffers (much like datagram.c)
128 * Alan Cox : Fixed stuck sockets in probe. A probe
129 * now gets fed up with retrying when it
130 * gets no answer (not even a 'no space' one).
131 * Alan Cox : Extracted closing code better
132 * Alan Cox : Fixed the closing state machine to
133 * resemble the RFC.
134 * Alan Cox : More 'per spec' fixes.
135 * Alan Cox : tcp_data() doesn't ack illegal PSH
136 * only frames. At least one pc tcp stack
137 * generates them.
138 *
139 *
140 * To Fix:
141 * Fast path the code. Two things here - fix the window calculation
142 * so it doesn't iterate over the queue, and spot packets with no funny
143 * options arriving in order and process them directly.
144 *
145 * Implement RFC 1191 [Path MTU discovery]
146 * Look at the effect of implementing RFC 1337 suggestions and their impact.
147 * Rewrite output state machine to use a single queue and do low window
148 * situations as per the spec (RFC 1122)
149 * Speed up input assembly algorithm.
150 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
151 * could do with it working on IPv4
152 * User settable/learned rtt/max window/mtu
153 * Cope with MTU/device switches when retransmitting in tcp.
154 * Fix the window handling to use PR's new code.
155 *
156 * Change the fundamental structure to a single send queue maintained
157 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on
158 * active routes too]). Cut the queue off in tcp_retransmit/
159 * tcp_transmit.
160 * Change the receive queue to assemble as it goes. This lets us
161 * dispose of most of tcp_sequence, half of tcp_ack and chunks of
162 * tcp_data/tcp_read as well as the window shrink crud.
163 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
164 * tcp_queue_skb seem obvious routines to extract.
165 *
166 * This program is free software; you can redistribute it and/or
167 * modify it under the terms of the GNU General Public License
168 * as published by the Free Software Foundation; either version
169 * 2 of the License, or(at your option) any later version.
170 *
171 * Description of States:
172 *
173 * TCP_SYN_SENT sent a connection request, waiting for ack
174 *
175 * TCP_SYN_RECV received a connection request, sent ack,
176 * waiting for final ack in three-way handshake.
177 *
178 * TCP_ESTABLISHED connection established
179 *
180 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
181 * transmission of remaining buffered data
182 *
183 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
184 * to shutdown
185 *
186 * TCP_CLOSING both sides have shutdown but we still have
187 * data we have to finish sending
188 *
189 * TCP_TIME_WAIT timeout to catch resent junk before entering
190 * closed, can only be entered from FIN_WAIT2
191 * or CLOSING. Required because the other end
192 * may not have gotten our last ACK causing it
193 * to retransmit the data packet (which we ignore)
194 *
195 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
196 * us to finish writing our data and to shutdown
197 * (we have to close() to move on to LAST_ACK)
198 *
199 * TCP_LAST_ACK our side has shutdown after the remote has
200 * shutdown. There may still be data in our
201 * buffer that we have to finish sending
202 *
203 * TCP_CLOSE socket is finished
204 */
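/*
 * Illustrative sketch of the close paths described above: a minimal,
 * self-contained demo (hence the #if 0 guard). The demo_* names are
 * hypothetical stand-ins for the kernel's TCP_* states.
 */
#if 0
#include <stdio.h>

enum demo_state {
	DEMO_ESTABLISHED, DEMO_FIN_WAIT1, DEMO_FIN_WAIT2, DEMO_CLOSING,
	DEMO_TIME_WAIT, DEMO_CLOSE_WAIT, DEMO_LAST_ACK, DEMO_CLOSE
};

/* Where a local close()/shutdown takes us. */
static enum demo_state demo_on_close(enum demo_state s)
{
	switch (s) {
	case DEMO_ESTABLISHED: return DEMO_FIN_WAIT1;	/* we shut down first */
	case DEMO_CLOSE_WAIT:  return DEMO_LAST_ACK;	/* remote already has */
	default:               return s;
	}
}

/* Where a FIN arriving from the remote takes us. */
static enum demo_state demo_on_fin(enum demo_state s)
{
	switch (s) {
	case DEMO_ESTABLISHED: return DEMO_CLOSE_WAIT;	/* remote shut down first */
	case DEMO_FIN_WAIT1:   return DEMO_CLOSING;	/* both sides shut down */
	case DEMO_FIN_WAIT2:   return DEMO_TIME_WAIT;	/* catch resent junk */
	default:               return s;
	}
}

int main(void)
{
	/* A simultaneous close: we shut down, then the remote's FIN crosses ours. */
	enum demo_state s = demo_on_fin(demo_on_close(DEMO_ESTABLISHED));
	printf("state %d == DEMO_CLOSING\n", s);
	return 0;
}
#endif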
205
206 #include <linux/types.h>
207 #include <linux/sched.h>
208 #include <linux/mm.h>
209 #include <linux/time.h>
210 #include <linux/string.h>
211 #include <linux/config.h>
212 #include <linux/socket.h>
213 #include <linux/sockios.h>
214 #include <linux/termios.h>
215 #include <linux/in.h>
216 #include <linux/fcntl.h>
217 #include <linux/inet.h>
218 #include <linux/netdevice.h>
219 #include "snmp.h"
220 #include "ip.h"
221 #include "protocol.h"
222 #include "icmp.h"
223 #include "tcp.h"
224 #include "arp.h"
225 #include <linux/skbuff.h>
226 #include "sock.h"
227 #include "route.h"
228 #include <linux/errno.h>
229 #include <linux/timer.h>
230 #include <asm/system.h>
231 #include <asm/segment.h>
232 #include <linux/mm.h>
233
234 /*
235 * The MSL timer is the 'normal' timer.
236 */
237
238 #define reset_msl_timer(x,y,z) reset_timer(x,y,z)
239
240 #define SEQ_TICK 3
241 unsigned long seq_offset;
242 struct tcp_mib tcp_statistics;
243
244 static void tcp_close(struct sock *sk, int timeout);
245
246
247 /*
248 * The less said about this the better, but it works and will do for 1.2
249 */
250
251 static struct wait_queue *master_select_wakeup;
252
253 static __inline__ int min(unsigned int a, unsigned int b)
254 {
255 if (a < b)
256 return(a);
257 return(b);
258 }
259
260 #undef STATE_TRACE
261
262 #ifdef STATE_TRACE
263 static char *statename[]={
264 "Unused","Established","Syn Sent","Syn Recv",
265 "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
266 "Close Wait","Last ACK","Listen","Closing"
267 };
268 #endif
269
270 static __inline__ void tcp_set_state(struct sock *sk, int state)
271 {
272 if(sk->state==TCP_ESTABLISHED)
273 tcp_statistics.TcpCurrEstab--;
274 #ifdef STATE_TRACE
275 if(sk->debug)
276 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
277 #endif
278 /* This is a hack but it doesn't occur often and it's going to
279 be a real pain to fix nicely */
280
281 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
282 {
283 wake_up_interruptible(&master_select_wakeup);
284 }
285 sk->state=state;
286 if(state==TCP_ESTABLISHED)
287 tcp_statistics.TcpCurrEstab++;
288 }
289
290 /*
291 * This routine picks a TCP window for a socket based on
292 * the following constraints
293 *
294 * 1. The window can never be shrunk once it is offered (RFC 793)
295 * 2. We limit memory per socket
296 *
297 * For now we use NET2E3's heuristic of offering half the memory
298 * we have handy. All is not as bad as this seems however because
299 * of two things. Firstly we will bin packets even within the window
300 * in order to get the data we are waiting for into the memory limit.
301 * Secondly we bin common duplicate forms at receive time
302 * Better heuristics welcome
303 */
304
305 int tcp_select_window(struct sock *sk)
306 {
307 int new_window = sk->prot->rspace(sk);
308
309 if(sk->window_clamp)
310 new_window=min(sk->window_clamp,new_window);
311 /*
312 * Two things are going on here. First, we don't ever offer a
313 * window less than min(sk->mss, MAX_WINDOW/2). This is the
314 * receiver side of SWS as specified in RFC1122.
315 * Second, we always give them at least the window they
316 * had before, in order to avoid retracting window. This
317 * is technically allowed, but RFC1122 advises against it and
318 * in practice it causes trouble.
319 *
320 * Fixme: This doesn't correctly handle the case where
321 * new_window > sk->window but not by enough to allow for the
322 * shift in sequence space.
323 */
324 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
325 return(sk->window);
326 return(new_window);
327 }
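/*
 * A worked, standalone sketch of the two rules enforced above; the
 * numbers and demo_* names are hypothetical, and DEMO_MAX_WINDOW
 * merely plays the role of MAX_WINDOW.
 */
#if 0
#include <stdio.h>

#define DEMO_MAX_WINDOW 32767

static unsigned int demo_min(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

static unsigned int demo_select_window(unsigned int free_space,
	unsigned int mss, unsigned int cur_window)
{
	/* Receiver-side SWS avoidance: never offer a tiny window. */
	if (free_space < demo_min(mss, DEMO_MAX_WINDOW / 2))
		return cur_window;
	/* RFC 793: never take back window we have already offered. */
	if (free_space < cur_window)
		return cur_window;
	return free_space;
}

int main(void)
{
	/* Only 200 bytes free with a 1460 byte mss: keep the old window. */
	printf("%u\n", demo_select_window(200, 1460, 4096));	/* 4096 */
	/* Plenty of room: offer it all. */
	printf("%u\n", demo_select_window(8192, 1460, 4096));	/* 8192 */
	return 0;
}
#endif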
328
329 /*
330 * Find someone to 'accept'. Must be called with
331 * sk->inuse=1 or cli()
332 */
333
334 static struct sk_buff *tcp_find_established(struct sock *s)
335 {
336 struct sk_buff *p=skb_peek(&s->receive_queue);
337 if(p==NULL)
338 return NULL;
339 do
340 {
341 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
342 return p;
343 p=p->next;
344 }
345 while(p!=(struct sk_buff *)&s->receive_queue);
346 return NULL;
347 }
348
349 /*
350 * Remove a completed connection and return it. This is used by
351 * tcp_accept() to get connections from the queue.
352 */
353
354 static struct sk_buff *tcp_dequeue_established(struct sock *s)
355 {
356 struct sk_buff *skb;
357 unsigned long flags;
358 save_flags(flags);
359 cli();
360 skb=tcp_find_established(s);
361 if(skb!=NULL)
362 skb_unlink(skb); /* Take it off the queue */
363 restore_flags(flags);
364 return skb;
365 }
366
367 /*
368 * This routine closes sockets which have been at least partially
369 * opened, but not yet accepted. Currently it is only called by
370 * tcp_close, and timeout mirrors the value there.
371 */
372
373 static void tcp_close_pending (struct sock *sk)
374 {
375 struct sk_buff *skb;
376
377 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
378 {
379 skb->sk->dead=1;
380 tcp_close(skb->sk, 0);
381 kfree_skb(skb, FREE_READ);
382 }
383 return;
384 }
385
386 /*
387 * Enter the time wait state.
388 */
389
390 static void tcp_time_wait(struct sock *sk)
391 {
392 tcp_set_state(sk,TCP_TIME_WAIT);
393 sk->shutdown = SHUTDOWN_MASK;
394 if (!sk->dead)
395 sk->state_change(sk);
396 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
397 }
398
399 /*
400 * A socket has timed out on its send queue and wants to do a
401 * little retransmitting. Currently this means TCP.
402 */
403
404 void tcp_do_retransmit(struct sock *sk, int all)
405 {
406 struct sk_buff * skb;
407 struct proto *prot;
408 struct device *dev;
409 int ct=0;
410
411 prot = sk->prot;
412 skb = sk->send_head;
413
414 while (skb != NULL)
415 {
416 struct tcphdr *th;
417 struct iphdr *iph;
418 int size;
419
420 dev = skb->dev;
421 IS_SKB(skb);
422 skb->when = jiffies;
423
424 /*
425 * In general it's OK just to use the old packet. However we
426 * need to use the current ack and window fields. Urg and
427 * urg_ptr could possibly stand to be updated as well, but we
428 * don't keep the necessary data. That shouldn't be a problem,
429 * if the other end is doing the right thing. Since we're
430 * changing the packet, we have to issue a new IP identifier.
431 */
432
433 iph = (struct iphdr *)(skb->data + dev->hard_header_len);
434 th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
435 size = skb->len - (((unsigned char *) th) - skb->data);
436
437 /*
438 * Note: We ought to check for window limits here but
439 * currently this is done (less efficiently) elsewhere.
440 * We do need to check for a route change but can't handle
441 * that until we have the new 1.3.x buffers in.
442 *
443 */
444
445 iph->id = htons(ip_id_count++);
446 ip_send_check(iph);
447
448 /*
449 * This is not the right way to handle this. We have to
450 * issue an up to date window and ack report with this
451 * retransmit to keep happy the odd buggy tcp that
452 * relies on BSD doing this.
453 * We don't however need to recalculate the entire
454 * checksum, so someone wanting a small problem to play
455 * with might like to implement RFC1141/RFC1624 and speed
456 * this up by avoiding a full checksum.
457 */
458
459 th->ack_seq = ntohl(sk->acked_seq);
460 th->window = ntohs(tcp_select_window(sk));
461 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
462
463 /*
464 * If the interface is (still) up and running, kick it.
465 */
466
467 if (dev->flags & IFF_UP)
468 {
469 /*
470 * If the packet is still being sent by the device/protocol
471 * below then don't retransmit. This is both needed, and good -
472 * especially with connected mode AX.25 where it stops resends
473 * of a frame that hasn't even been sent yet!
474 * We still add up the counts as the round trip time wants
475 * adjusting.
476 */
477 if (sk && !skb_device_locked(skb))
478 {
479 /* Remove it from any existing driver queue first! */
480 skb_unlink(skb);
481 /* Now queue it */
482 ip_statistics.IpOutRequests++;
483 dev_queue_xmit(skb, dev, sk->priority);
484 }
485 }
486
487 /*
488 * Count retransmissions
489 */
490
491 ct++;
492 sk->prot->retransmits ++;
493
494 /*
495 * Only one retransmit requested.
496 */
497
498 if (!all)
499 break;
500
501 /*
502 * This should cut it off before we send too many packets.
503 */
504
505 if (ct >= sk->cong_window)
506 break;
507 skb = skb->link3;
508 }
509 }
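/*
 * The comment above points at RFC1141/RFC1624 as the way to avoid a
 * full checksum recompute. A minimal standalone sketch of the RFC1624
 * update, HC' = ~(~HC + ~m + m'), for one changed 16-bit word; the
 * demo_* names and the sample values are hypothetical.
 */
#if 0
#include <stdio.h>

static unsigned short demo_csum_update(unsigned short check,
	unsigned short old_word, unsigned short new_word)
{
	unsigned long sum = (unsigned short)~check;
	sum += (unsigned short)~old_word;
	sum += new_word;
	while (sum >> 16)			/* fold carries back into 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);
	return (unsigned short)~sum;
}

int main(void)
{
	/* e.g. the window field changed from 0x1000 to 0x0800. */
	unsigned short check = demo_csum_update(0x6c5a, 0x1000, 0x0800);
	printf("updated checksum: 0x%04x\n", check);
	return 0;
}
#endif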
510
511 /*
512 * Reset the retransmission timer
513 */
514
515 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
516 {
517 del_timer(&sk->retransmit_timer);
518 sk->ip_xmit_timeout = why;
519 if((int)when < 0)
520 {
521 when=3;
522 printk("Error: Negative timer in xmit_timer\n");
523 }
524 sk->retransmit_timer.expires=when;
525 add_timer(&sk->retransmit_timer);
526 }
527
528 /*
529 * This is the normal code called for timeouts. It does the retransmission
530 * and then does backoff. tcp_do_retransmit is separated out because
531 * tcp_ack needs to send stuff from the retransmit queue without
532 * initiating a backoff.
533 */
534
535
536 void tcp_retransmit_time(struct sock *sk, int all)
537 {
538 tcp_do_retransmit(sk, all);
539
540 /*
541 * Increase the timeout each time we retransmit. Note that
542 * we do not increase the rtt estimate. rto is initialized
543 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
544 * that doubling rto each time is the least we can get away with.
545 * In KA9Q, Karn uses this for the first few times, and then
546 * goes to quadratic. netBSD doubles, but only goes up to *64,
547 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
548 * defined in the protocol as the maximum possible RTT. I guess
549 * we'll have to use something other than TCP to talk to the
550 * University of Mars.
551 *
552 * PAWS allows us longer timeouts and large windows, so once
553 * implemented ftp to mars will work nicely. We will have to fix
554 * the 120 second clamps though!
555 */
556
557 sk->retransmits++;
558 sk->backoff++;
559 sk->rto = min(sk->rto << 1, 120*HZ);
560 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
561 }
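/*
 * A standalone sketch of the doubling rule above. DEMO_HZ and the
 * initial rto are assumptions for illustration (100 ticks/s, 3 s),
 * not values taken from this file.
 */
#if 0
#include <stdio.h>

#define DEMO_HZ 100

int main(void)
{
	unsigned long rto = 3 * DEMO_HZ;
	int n;

	for (n = 1; n <= 8; n++) {
		rto <<= 1;
		if (rto > 120 * DEMO_HZ)	/* the 120 second clamp */
			rto = 120 * DEMO_HZ;
		printf("backoff %d: rto = %lu s\n", n, rto / DEMO_HZ);
	}
	return 0;	/* prints 6, 12, 24, 48, 96, 120, 120, 120 */
}
#endif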
562
563
564 /*
565 * A timer event has triggered a tcp retransmit timeout. The
566 * socket xmit queue is ready and set up to send. Because
567 * the ack receive code keeps the queue straight we do
568 * nothing clever here.
569 */
570
571 static void tcp_retransmit(struct sock *sk, int all)
572 {
573 if (all)
574 {
575 tcp_retransmit_time(sk, all);
576 return;
577 }
578
579 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
580 /* sk->ssthresh in theory can be zero. I guess that's OK */
581 sk->cong_count = 0;
582
583 sk->cong_window = 1;
584
585 /* Do the actual retransmit. */
586 tcp_retransmit_time(sk, all);
587 }
588
589 /*
590 * A write timeout has occurred. Process the after effects.
591 */
592
593 static int tcp_write_timeout(struct sock *sk)
594 {
595 /*
596 * Look for a 'soft' timeout.
597 */
598 if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
599 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
600 {
601 /*
602 * Attempt to recover if arp has changed (unlikely!) or
603 * a route has shifted (not supported prior to 1.3).
604 */
605 arp_destroy (sk->daddr, 0);
606 ip_route_check (sk->daddr);
607 }
608 /*
609 * Has it gone just too far?
610 */
611 if (sk->retransmits > TCP_RETR2)
612 {
613 sk->err = ETIMEDOUT;
614 sk->error_report(sk);
615 del_timer(&sk->retransmit_timer);
616 /*
617 * Time wait the socket
618 */
619 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
620 {
621 tcp_set_state(sk,TCP_TIME_WAIT);
622 reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
623 }
624 else
625 {
626 /*
627 * Clean up time.
628 */
629 tcp_set_state(sk, TCP_CLOSE);
630 return 0;
631 }
632 }
633 return 1;
634 }
635
636 /*
637 * The TCP retransmit timer. This lacks a few small details.
638 *
639 * 1. An initial rtt timeout on the probe0 should cause what we can
640 * of the first write queue buffer to be split and sent.
641 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report
642 * ETIMEDOUT if we know an additional 'soft' error caused this.
643 * tcp_err should save a 'soft error' for us.
644 */
645
646 static void retransmit_timer(unsigned long data)
647 {
648 struct sock *sk = (struct sock*)data;
649 int why = sk->ip_xmit_timeout;
650
651 /*
652 * only process if socket is not in use
653 */
654
655 cli();
656 if (sk->inuse || in_bh)
657 {
658 /* Try again in 1 second */
659 sk->retransmit_timer.expires = HZ;
660 add_timer(&sk->retransmit_timer);
661 sti();
662 return;
663 }
664
665 sk->inuse = 1;
666 sti();
667
668 /* Always see if we need to send an ack. */
669
670 if (sk->ack_backlog && !sk->zapped)
671 {
672 sk->prot->read_wakeup (sk);
673 if (! sk->dead)
674 sk->data_ready(sk,0);
675 }
676
677 /* Now we need to figure out why the socket was on the timer. */
678
679 switch (why)
680 {
681 /* Window probing */
682 case TIME_PROBE0:
683 tcp_send_probe0(sk);
684 tcp_write_timeout(sk);
685 break;
686 /* Retransmitting */
687 case TIME_WRITE:
688 /* It could be we got here because we needed to send an ack.
689 * So we need to check for that.
690 */
691 {
692 struct sk_buff *skb;
693 unsigned long flags;
694
695 save_flags(flags);
696 cli();
697 skb = sk->send_head;
698 if (!skb)
699 {
700 restore_flags(flags);
701 }
702 else
703 {
704 /*
705 * Kicked by a delayed ack. Reset timer
706 * correctly now
707 */
708 if (jiffies < skb->when + sk->rto)
709 {
710 reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
711 restore_flags(flags);
712 break;
713 }
714 restore_flags(flags);
715 /*
716 * Retransmission
717 */
718 sk->prot->retransmit (sk, 0);
719 tcp_write_timeout(sk);
720 }
721 break;
722 }
723 /* Sending Keepalives */
724 case TIME_KEEPOPEN:
725 /*
726 * this reset_timer() call is a hack, this is not
727 * how KEEPOPEN is supposed to work.
728 */
729 reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
730
731 /* Send something to keep the connection open. */
732 if (sk->prot->write_wakeup)
733 sk->prot->write_wakeup (sk);
734 sk->retransmits++;
735 tcp_write_timeout(sk);
736 break;
737 default:
738 printk ("rexmit_timer: timer expired - reason unknown\n");
739 break;
740 }
741 release_sock(sk);
742 }
743
744 /*
745 * This routine is called by the ICMP module when it gets some
746 * sort of error condition. If err < 0 then the socket should
747 * be closed and the error returned to the user. If err > 0
748 * it's just the icmp type << 8 | icmp code. After adjustment
749 * header points to the first 8 bytes of the tcp header. We need
750 * to find the appropriate port.
751 */
752
753 void tcp_err(int err, unsigned char *header, unsigned long daddr,
754 unsigned long saddr, struct inet_protocol *protocol)
755 {
756 struct tcphdr *th;
757 struct sock *sk;
758 struct iphdr *iph=(struct iphdr *)header;
759
760 header+=4*iph->ihl;
761
762
763 th =(struct tcphdr *)header;
764 sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
765
766 if (sk == NULL)
767 return;
768
769 if(err<0)
770 {
771 sk->err = -err;
772 sk->error_report(sk);
773 return;
774 }
775
776 if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8))
777 {
778 /*
779 * FIXME:
780 * For now we will just trigger a linear backoff.
781 * The slow start code should cause a real backoff here.
782 */
783 if (sk->cong_window > 4)
784 sk->cong_window--;
785 return;
786 }
787
788 /* sk->err = icmp_err_convert[err & 0xff].errno; -- moved as TCP should hide non fatals internally (and does) */
789
790 /*
791 * If we've already connected we will keep trying
792 * until we time out, or the user gives up.
793 */
794
795 if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT)
796 {
797 if (sk->state == TCP_SYN_SENT)
798 {
799 tcp_statistics.TcpAttemptFails++;
800 tcp_set_state(sk,TCP_CLOSE);
801 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
802 }
803 sk->err = icmp_err_convert[err & 0xff].errno;
804 }
805 return;
806 }
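/*
 * A tiny standalone sketch of the err encoding documented above
 * (err > 0 means icmp type << 8 | icmp code); the sample type/code
 * pair is just an example.
 */
#if 0
#include <stdio.h>

int main(void)
{
	int err = (3 << 8) | 4;	/* destination unreachable, frag needed */
	printf("icmp type %d, code %d\n", err >> 8, err & 0xff);
	return 0;
}
#endif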
807
808
809 /*
810 * Walk down the receive queue counting readable data until we hit the end or we find a gap
811 * in the received data queue (ie a frame missing that needs sending to us). Not
812 * sorting the data into two queues as it arrives makes life so much harder.
813 */
814
815 static int tcp_readable(struct sock *sk)
816 {
817 unsigned long counted;
818 unsigned long amount;
819 struct sk_buff *skb;
820 int sum;
821 unsigned long flags;
822
823 if(sk && sk->debug)
824 printk("tcp_readable: %p - ",sk);
825
826 save_flags(flags);
827 cli();
828 if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
829 {
830 restore_flags(flags);
831 if(sk && sk->debug)
832 printk("empty\n");
833 return(0);
834 }
835
836 counted = sk->copied_seq; /* Where we are at the moment */
837 amount = 0;
838
839 /*
840 * Do until a push or until we are out of data.
841 */
842
843 do
844 {
845 if (before(counted, skb->h.th->seq)) /* Found a hole so stop here */
846 break;
847 sum = skb->len -(counted - skb->h.th->seq); /* Length - header but start from where we are up to (avoid overlaps) */
848 if (skb->h.th->syn)
849 sum++;
850 if (sum > 0)
851 { /* Add it up, move on */
852 amount += sum;
853 if (skb->h.th->syn)
854 amount--;
855 counted += sum;
856 }
857 /*
858 * Don't count urg data ... but do it in the right place!
859 * Consider: "old_data (ptr is here) URG PUSH data"
860 * The old code would stop at the first push because
861 * it counted the urg (amount==1) and then does amount--
862 * *after* the loop. This means tcp_readable() always
863 * returned zero if any URG PUSH was in the queue, even
864 * though there was normal data available. If we subtract
865 * the urg data right here, we even get it to work for more
866 * than one URG PUSH skb without normal data.
867 * This means that select() finally works now with urg data
868 * in the queue. Note that rlogin was never affected
869 * because it doesn't use select(); it uses two processes
870 * and a blocking read(). And the queue scan in tcp_read()
871 * was correct. Mike <pall@rz.uni-karlsruhe.de>
872 */
873 if (skb->h.th->urg)
874 amount--; /* don't count urg data */
875 if (amount && skb->h.th->psh) break;
876 skb = skb->next;
877 }
878 while(skb != (struct sk_buff *)&sk->receive_queue);
879
880 restore_flags(flags);
881 if(sk->debug)
882 printk("got %lu bytes.\n",amount);
883 return(amount);
884 }
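/*
 * A standalone sketch of the counting rules above: urgent bytes are
 * not counted, and a PSH stops the walk once something is readable.
 * The flat demo_skb array is a hypothetical stand-in for the real
 * sk_buff queue and ignores holes and overlaps.
 */
#if 0
#include <stdio.h>

struct demo_skb { int len; int urg; int psh; };

static int demo_readable(const struct demo_skb *q, int n)
{
	int amount = 0, i;

	for (i = 0; i < n; i++) {
		amount += q[i].len;
		if (q[i].urg)
			amount--;	/* don't count urg data */
		if (amount && q[i].psh)
			break;		/* a push ends the read */
	}
	return amount;
}

int main(void)
{
	/* 5 data bytes, one urgent byte, then 3 pushed data bytes. */
	struct demo_skb q[] = { {5, 0, 0}, {1, 1, 0}, {3, 0, 1} };
	printf("readable: %d\n", demo_readable(q, 3));	/* 8 */
	return 0;
}
#endif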
885
886 /*
887 * LISTEN is a special case for select..
888 */
889 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
890 {
891 if (sel_type == SEL_IN) {
892 int retval;
893
894 sk->inuse = 1;
895 retval = (tcp_find_established(sk) != NULL);
896 release_sock(sk);
897 if (!retval)
898 select_wait(&master_select_wakeup,wait);
899 return retval;
900 }
901 return 0;
902 }
903
904
905 /*
906 * Wait for a TCP event.
907 *
908 * Note that we don't need to set "sk->inuse", as the upper select layers
909 * take care of normal races (between the test and the event) and we don't
910 * go look at any of the socket buffers directly.
911 */
912 static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
913 {
914 if (sk->state == TCP_LISTEN)
915 return tcp_listen_select(sk, sel_type, wait);
916
917 switch(sel_type) {
918 case SEL_IN:
919 if (sk->err)
920 return 1;
921 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
922 break;
923
924 if (sk->shutdown & RCV_SHUTDOWN)
925 return 1;
926
927 if (sk->acked_seq == sk->copied_seq)
928 break;
929
930 if (sk->urg_seq != sk->copied_seq ||
931 sk->acked_seq != sk->copied_seq+1 ||
932 sk->urginline || !sk->urg_data)
933 return 1;
934 break;
935
936 case SEL_OUT:
937 if (sk->shutdown & SEND_SHUTDOWN)
938 return 0;
939 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
940 break;
941 /*
942 * This is now right thanks to a small fix
943 * by Matt Dillon.
944 */
945
946 if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
947 break;
948 return 1;
949
950 case SEL_EX:
951 if (sk->err || sk->urg_data)
952 return 1;
953 break;
954 }
955 select_wait(sk->sleep, wait);
956 return 0;
957 }
958
959 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
960 {
961 int err;
962 switch(cmd)
963 {
964
965 case TIOCINQ:
966 #ifdef FIXME /* FIXME: */
967 case FIONREAD:
968 #endif
969 {
970 unsigned long amount;
971
972 if (sk->state == TCP_LISTEN)
973 return(-EINVAL);
974
975 sk->inuse = 1;
976 amount = tcp_readable(sk);
977 release_sock(sk);
978 err=verify_area(VERIFY_WRITE,(void *)arg,
979 sizeof(unsigned long));
980 if(err)
981 return err;
982 put_fs_long(amount,(unsigned long *)arg);
983 return(0);
984 }
985 case SIOCATMARK:
986 {
987 int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
988
989 err = verify_area(VERIFY_WRITE,(void *) arg,
990 sizeof(unsigned long));
991 if (err)
992 return err;
993 put_fs_long(answ,(int *) arg);
994 return(0);
995 }
996 case TIOCOUTQ:
997 {
998 unsigned long amount;
999
1000 if (sk->state == TCP_LISTEN) return(-EINVAL);
1001 amount = sk->prot->wspace(sk);
1002 err=verify_area(VERIFY_WRITE,(void *)arg,
1003 sizeof(unsigned long));
1004 if(err)
1005 return err;
1006 put_fs_long(amount,(unsigned long *)arg);
1007 return(0);
1008 }
1009 default:
1010 return(-EINVAL);
1011 }
1012 }
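/*
 * How the ioctls above look from user space; a hedged, standalone
 * fragment with error handling omitted. fd is assumed to be a
 * connected TCP socket, and header placement of the request macros
 * can vary between libc versions.
 */
#if 0
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>

static void demo_queue_sizes(int fd)
{
	int pending = 0, space = 0, at_mark = 0;

	ioctl(fd, TIOCINQ, &pending);	/* bytes readable right now */
	ioctl(fd, TIOCOUTQ, &space);	/* write side, as computed above */
	ioctl(fd, SIOCATMARK, &at_mark);/* sitting at the urgent mark? */
	printf("in %d, out %d, atmark %d\n", pending, space, at_mark);
}
#endif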
1013
1014
1015 /*
1016 * This routine computes a TCP checksum.
1017 */
1018
1019 unsigned short tcp_check(struct tcphdr *th, int len,
1020 unsigned long saddr, unsigned long daddr)
1021 {
1022 unsigned long sum;
1023
1024 if (saddr == 0) saddr = ip_my_addr();
1025
1026 /*
1027 * stupid, gcc complains when I use just one __asm__ block,
1028 * something about too many reloads, but this is just two
1029 * instructions longer than what I want
1030 */
1031 __asm__("
1032 addl %%ecx, %%ebx
1033 adcl %%edx, %%ebx
1034 adcl $0, %%ebx
1035 "
1036 : "=b"(sum)
1037 : "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
1038 : "bx", "cx", "dx" );
1039 __asm__("
1040 movl %%ecx, %%edx
1041 cld
1042 cmpl $32, %%ecx
1043 jb 2f
1044 shrl $5, %%ecx
1045 clc
1046 1: lodsl
1047 adcl %%eax, %%ebx
1048 lodsl
1049 adcl %%eax, %%ebx
1050 lodsl
1051 adcl %%eax, %%ebx
1052 lodsl
1053 adcl %%eax, %%ebx
1054 lodsl
1055 adcl %%eax, %%ebx
1056 lodsl
1057 adcl %%eax, %%ebx
1058 lodsl
1059 adcl %%eax, %%ebx
1060 lodsl
1061 adcl %%eax, %%ebx
1062 loop 1b
1063 adcl $0, %%ebx
1064 movl %%edx, %%ecx
1065 2: andl $28, %%ecx
1066 je 4f
1067 shrl $2, %%ecx
1068 clc
1069 3: lodsl
1070 adcl %%eax, %%ebx
1071 loop 3b
1072 adcl $0, %%ebx
1073 4: movl $0, %%eax
1074 testw $2, %%dx
1075 je 5f
1076 lodsw
1077 addl %%eax, %%ebx
1078 adcl $0, %%ebx
1079 movw $0, %%ax
1080 5: test $1, %%edx
1081 je 6f
1082 lodsb
1083 addl %%eax, %%ebx
1084 adcl $0, %%ebx
1085 6: movl %%ebx, %%eax
1086 shrl $16, %%eax
1087 addw %%ax, %%bx
1088 adcw $0, %%bx
1089 "
1090 : "=b"(sum)
1091 : "0"(sum), "c"(len), "S"(th)
1092 : "ax", "bx", "cx", "dx", "si" );
1093
1094 /* We only want the bottom 16 bits, but we never cleared the top 16. */
1095
1096 return((~sum) & 0xffff);
1097 }
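/*
 * The same ones-complement algorithm in portable C, for readers who
 * don't speak i386 assembler. A standalone sketch, not a drop-in
 * replacement: it assumes 32-bit addresses and reads the segment as
 * big-endian 16-bit words instead of host-order longwords.
 */
#if 0
#include <stdio.h>

static unsigned short demo_tcp_check(const unsigned char *data, int len,
	unsigned long saddr, unsigned long daddr)
{
	unsigned long sum = 0;
	int i;

	/* Pseudo header: both addresses, protocol 6 (TCP), and length. */
	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += 6 + len;

	for (i = 0; i + 1 < len; i += 2)
		sum += (data[i] << 8) | data[i + 1];
	if (i < len)
		sum += data[i] << 8;	/* odd trailing byte, zero padded */

	while (sum >> 16)		/* fold the carries into 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);
	return (unsigned short)~sum;
}
#endif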
1098
1099
1100
1101 void tcp_send_check(struct tcphdr *th, unsigned long saddr,
1102 unsigned long daddr, int len, struct sock *sk)
1103 {
1104 th->check = 0;
1105 th->check = tcp_check(th, len, saddr, daddr);
1106 return;
1107 }
1108
1109 /*
1110 * This is the main buffer sending routine. We queue the buffer
1111 * having checked it is sane seeming.
1112 */
1113
1114 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
1115 {
1116 int size;
1117 struct tcphdr * th = skb->h.th;
1118
1119 /*
1120 * length of packet (not counting length of pre-tcp headers)
1121 */
1122
1123 size = skb->len - ((unsigned char *) th - skb->data);
1124
1125 /*
1126 * Sanity check it..
1127 */
1128
1129 if (size < sizeof(struct tcphdr) || size > skb->len)
1130 {
1131 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1132 skb, skb->data, th, skb->len);
1133 kfree_skb(skb, FREE_WRITE);
1134 return;
1135 }
1136
1137 /*
1138 * If we have queued a header size packet.. (these crash a few
1139 * tcp stacks if ack is not set)
1140 */
1141
1142 if (size == sizeof(struct tcphdr))
1143 {
1144 /* If it's got a syn or fin it's notionally included in the size..*/
1145 if(!th->syn && !th->fin)
1146 {
1147 printk("tcp_send_skb: attempt to queue a bogon.\n");
1148 kfree_skb(skb,FREE_WRITE);
1149 return;
1150 }
1151 }
1152
1153 /*
1154 * Actual processing.
1155 */
1156
1157 tcp_statistics.TcpOutSegs++;
1158 skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1159
1160 /*
1161 * We must queue if
1162 *
1163 * a) The right edge of this frame exceeds the window
1164 * b) We are retransmitting (Nagle's rule)
1165 * c) We have too many packets 'in flight'
1166 */
1167
1168 if (after(skb->h.seq, sk->window_seq) ||
1169 (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1170 sk->packets_out >= sk->cong_window)
1171 {
1172 /* checksum will be supplied by tcp_write_xmit. So
1173 * we shouldn't need to set it at all. I'm being paranoid */
1174 th->check = 0;
1175 if (skb->next != NULL)
1176 {
1177 printk("tcp_send_partial: next != NULL\n");
1178 skb_unlink(skb);
1179 }
1180 skb_queue_tail(&sk->write_queue, skb);
1181
1182 /*
1183 * If we don't fit we have to start the zero window
1184 * probes. This is broken - we really need to do a partial
1185 * send _first_ (This is what causes the Cisco and PC/TCP
1186 * grief).
1187 */
1188
1189 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1190 sk->send_head == NULL && sk->ack_backlog == 0)
1191 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1192 }
1193 else
1194 {
1195 /*
1196 * This is going straight out
1197 */
1198
1199 th->ack_seq = ntohl(sk->acked_seq);
1200 th->window = ntohs(tcp_select_window(sk));
1201
1202 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1203
1204 sk->sent_seq = sk->write_seq;
1205
1206 /*
1207 * This is mad. The tcp retransmit queue is put together
1208 * by the ip layer. This causes half the problems with
1209 * unroutable FIN's and other things.
1210 */
1211
1212 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1213
1214 /*
1215 * Set for next retransmit based on expected ACK time.
1216 * FIXME: We set this every time which means our
1217 * retransmits are really about a window behind.
1218 */
1219
1220 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1221 }
1222 }
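/*
 * A standalone sketch of the sequence arithmetic behind the
 * after(skb->h.seq, sk->window_seq) test above; assumes sequence
 * numbers held in a 32-bit unsigned type.
 */
#if 0
#include <stdio.h>

static int demo_before(unsigned int a, unsigned int b)
{
	return (int)(a - b) < 0;	/* signed trick survives wraparound */
}

int main(void)
{
	/* Once the space wraps, 0xfffffff0 counts as before 0x10. */
	printf("%d\n", demo_before(0xfffffff0u, 0x10u));	/* 1 */
	printf("%d\n", demo_before(0x10u, 0xfffffff0u));	/* 0 */
	return 0;
}
#endif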
1223
1224 /*
1225 * Locking problems lead us to a messy situation where we can have
1226 * multiple partially complete buffers queued up. This is really bad
1227 * as we don't want to be sending partial buffers. Fix this with
1228 * a semaphore or similar to lock tcp_write per socket.
1229 *
1230 * These routines are pretty self descriptive.
1231 */
1232
1233 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
1234 {
1235 struct sk_buff * skb;
1236 unsigned long flags;
1237
1238 save_flags(flags);
1239 cli();
1240 skb = sk->partial;
1241 if (skb) {
1242 sk->partial = NULL;
1243 del_timer(&sk->partial_timer);
1244 }
1245 restore_flags(flags);
1246 return skb;
1247 }
1248
1249 /*
1250 * Empty the partial queue
1251 */
1252
1253 static void tcp_send_partial(struct sock *sk)
1254 {
1255 struct sk_buff *skb;
1256
1257 if (sk == NULL)
1258 return;
1259 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1260 tcp_send_skb(sk, skb);
1261 }
1262
1263 /*
1264 * Queue a partial frame
1265 */
1266
1267 void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
1268 {
1269 struct sk_buff * tmp;
1270 unsigned long flags;
1271
1272 save_flags(flags);
1273 cli();
1274 tmp = sk->partial;
1275 if (tmp)
1276 del_timer(&sk->partial_timer);
1277 sk->partial = skb;
1278 init_timer(&sk->partial_timer);
1279 /*
1280 * Wait up to 1 second for the buffer to fill.
1281 */
1282 sk->partial_timer.expires = HZ;
1283 sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
1284 sk->partial_timer.data = (unsigned long) sk;
1285 add_timer(&sk->partial_timer);
1286 restore_flags(flags);
1287 if (tmp)
1288 tcp_send_skb(sk, tmp);
1289 }
1290
1291
1292 /*
1293 * This routine sends an ack and also updates the window.
1294 */
1295
1296 static void tcp_send_ack(unsigned long sequence, unsigned long ack,
1297 struct sock *sk,
1298 struct tcphdr *th, unsigned long daddr)
1299 {
1300 struct sk_buff *buff;
1301 struct tcphdr *t1;
1302 struct device *dev = NULL;
1303 int tmp;
1304
1305 if(sk->zapped)
1306 return; /* We have been reset, we may not send again */
1307
1308 /*
1309 * We need to grab some memory, and put together an ack,
1310 * and then put it into the queue to be sent.
1311 */
1312
1313 buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1314 if (buff == NULL)
1315 {
1316 /*
1317 * Force it to send an ack. We don't have to do this
1318 * (ACK is unreliable) but it's much better use of
1319 * bandwidth on slow links to send a spare ack than
1320 * resend packets.
1321 */
1322
1323 sk->ack_backlog++;
1324 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
1325 {
1326 reset_xmit_timer(sk, TIME_WRITE, HZ);
1327 }
1328 return;
1329 }
1330
1331 /*
1332 * Assemble a suitable TCP frame
1333 */
1334
1335 buff->len = sizeof(struct tcphdr);
1336 buff->sk = sk;
1337 buff->localroute = sk->localroute;
1338 t1 =(struct tcphdr *) buff->data;
1339
1340 /*
1341 * Put in the IP header and routing stuff.
1342 */
1343
1344 tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1345 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1346 if (tmp < 0)
1347 {
1348 buff->free = 1;
1349 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
1350 return;
1351 }
1352 buff->len += tmp;
1353 t1 =(struct tcphdr *)((char *)t1 +tmp);
1354
1355 memcpy(t1, th, sizeof(*t1));
1356
1357 /*
1358 * Swap the send and the receive.
1359 */
1360
1361 t1->dest = th->source;
1362 t1->source = th->dest;
1363 t1->seq = ntohl(sequence);
1364 t1->ack = 1;
1365 sk->window = tcp_select_window(sk);
1366 t1->window = ntohs(sk->window);
1367 t1->res1 = 0;
1368 t1->res2 = 0;
1369 t1->rst = 0;
1370 t1->urg = 0;
1371 t1->syn = 0;
1372 t1->psh = 0;
1373 t1->fin = 0;
1374
1375 /*
1376 * If we have nothing queued for transmit and the transmit timer
1377 * is on we are just doing an ACK timeout and need to switch
1378 * to a keepalive.
1379 */
1380
1381 if (ack == sk->acked_seq)
1382 {
1383 sk->ack_backlog = 0;
1384 sk->bytes_rcv = 0;
1385 sk->ack_timed = 0;
1386 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
1387 && sk->ip_xmit_timeout == TIME_WRITE)
1388 {
1389 if(sk->keepopen) {
1390 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1391 } else {
1392 delete_timer(sk);
1393 }
1394 }
1395 }
1396
1397 /*
1398 * Fill in the packet and send it
1399 */
1400
1401 t1->ack_seq = ntohl(ack);
1402 t1->doff = sizeof(*t1)/4;
1403 tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1404 if (sk->debug)
1405 printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
1406 tcp_statistics.TcpOutSegs++;
1407 sk->prot->queue_xmit(sk, dev, buff, 1);
1408 }
1409
1410
1411 /*
1412 * This routine builds a generic TCP header.
1413 */
1414
1415 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
1416 {
1417
1418 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1419 th->seq = htonl(sk->write_seq);
1420 th->psh =(push == 0) ? 1 : 0;
1421 th->doff = sizeof(*th)/4;
1422 th->ack = 1;
1423 th->fin = 0;
1424 sk->ack_backlog = 0;
1425 sk->bytes_rcv = 0;
1426 sk->ack_timed = 0;
1427 th->ack_seq = htonl(sk->acked_seq);
1428 sk->window = tcp_select_window(sk);
1429 th->window = htons(sk->window);
1430
1431 return(sizeof(*th));
1432 }
1433
1434 /*
1435 * This routine copies from a user buffer into a socket,
1436 * and starts the transmit system.
1437 */
1438
1439 static int tcp_write(struct sock *sk, unsigned char *from,
1440 int len, int nonblock, unsigned flags)
1441 {
1442 int copied = 0;
1443 int copy;
1444 int tmp;
1445 struct sk_buff *skb;
1446 struct sk_buff *send_tmp;
1447 unsigned char *buff;
1448 struct proto *prot;
1449 struct device *dev = NULL;
1450
1451 sk->inuse=1;
1452 prot = sk->prot;
1453 while(len > 0)
1454 {
1455 if (sk->err)
1456 { /* Stop on an error */
1457 release_sock(sk);
1458 if (copied)
1459 return(copied);
1460 tmp = -sk->err;
1461 sk->err = 0;
1462 return(tmp);
1463 }
1464
1465 /*
1466 * First thing we do is make sure that we are established.
1467 */
1468
1469 if (sk->shutdown & SEND_SHUTDOWN)
1470 {
1471 release_sock(sk);
1472 sk->err = EPIPE;
1473 if (copied)
1474 return(copied);
1475 sk->err = 0;
1476 return(-EPIPE);
1477 }
1478
1479 /*
1480 * Wait for a connection to finish.
1481 */
1482
1483 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
1484 {
1485 if (sk->err)
1486 {
1487 release_sock(sk);
1488 if (copied)
1489 return(copied);
1490 tmp = -sk->err;
1491 sk->err = 0;
1492 return(tmp);
1493 }
1494
1495 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
1496 {
1497 release_sock(sk);
1498 if (copied)
1499 return(copied);
1500
1501 if (sk->err)
1502 {
1503 tmp = -sk->err;
1504 sk->err = 0;
1505 return(tmp);
1506 }
1507
1508 if (sk->keepopen)
1509 {
1510 send_sig(SIGPIPE, current, 0);
1511 }
1512 return(-EPIPE);
1513 }
1514
1515 if (nonblock || copied)
1516 {
1517 release_sock(sk);
1518 if (copied)
1519 return(copied);
1520 return(-EAGAIN);
1521 }
1522
1523 release_sock(sk);
1524 cli();
1525
1526 if (sk->state != TCP_ESTABLISHED &&
1527 sk->state != TCP_CLOSE_WAIT && sk->err == 0)
1528 {
1529 interruptible_sleep_on(sk->sleep);
1530 if (current->signal & ~current->blocked)
1531 {
1532 sti();
1533 if (copied)
1534 return(copied);
1535 return(-ERESTARTSYS);
1536 }
1537 }
1538 sk->inuse = 1;
1539 sti();
1540 }
1541
1542 /*
1543 * The following code can result in copy <= 0 if sk->mss is ever
1544 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
1545 * sk->mtu is constant once SYN processing is finished. I.e. we
1546 * had better not get here until we've seen his SYN and at least one
1547 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
1548 * But ESTABLISHED should guarantee that. sk->max_window is by definition
1549 * non-decreasing. Note that any ioctl to set user_mss must be done
1550 * before the exchange of SYN's. If the initial ack from the other
1551 * end has a window of 0, max_window and thus mss will both be 0.
1552 */
1553
1554 /*
1555 * Now we need to check if we have a half built packet.
1556 */
1557
1558 if ((skb = tcp_dequeue_partial(sk)) != NULL)
1559 {
1560 int hdrlen;
1561
1562 /* IP header + TCP header */
1563 hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
1564 + sizeof(struct tcphdr);
1565
1566 /* Add more stuff to the end of skb->len */
1567 if (!(flags & MSG_OOB))
1568 {
1569 copy = min(sk->mss - (skb->len - hdrlen), len);
1570 /* FIXME: this is really a bug. */
1571 if (copy <= 0)
1572 {
1573 printk("TCP: **bug**: \"copy\" <= 0!!\n");
1574 copy = 0;
1575 }
1576
1577 memcpy_fromfs(skb->data + skb->len, from, copy);
1578 skb->len += copy;
1579 from += copy;
1580 copied += copy;
1581 len -= copy;
1582 sk->write_seq += copy;
1583 }
1584 if ((skb->len - hdrlen) >= sk->mss ||
1585 (flags & MSG_OOB) || !sk->packets_out)
1586 tcp_send_skb(sk, skb);
1587 else
1588 tcp_enqueue_partial(skb, sk);
1589 continue;
1590 }
1591
1592 /*
1593 * We also need to worry about the window.
1594 * If window < 1/2 the maximum window we've seen from this
1595 * host, don't use it. This is sender side
1596 * silly window prevention, as specified in RFC1122.
1597 * (Note that this is different than earlier versions of
1598 * SWS prevention, e.g. RFC813.). What we actually do is
1599 * use the whole MSS. Since this results in the right
1600 * edge of the packet being outside the window, it will
1601 * be queued for later rather than sent.
1602 */
1603
1604 copy = sk->window_seq - sk->write_seq;
1605 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1606 copy = sk->mss;
1607 if (copy > len)
1608 copy = len;
1609
1610 /*
1611 * We should really check the window here also.
1612 */
1613
1614 send_tmp = NULL;
1615 if (copy < sk->mss && !(flags & MSG_OOB))
1616 {
1617 /*
1618 * We will release the socket in case we sleep here.
1619 */
1620 release_sock(sk);
1621 /*
1622 * NB: following must be mtu, because mss can be increased.
1623 * mss is always <= mtu
1624 */
1625 skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
1626 sk->inuse = 1;
1627 send_tmp = skb;
1628 }
1629 else
1630 {
1631 /*
1632 * We will release the socket in case we sleep here.
1633 */
1634 release_sock(sk);
1635 skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
1636 sk->inuse = 1;
1637 }
1638
1639 /*
1640 * If we didn't get any memory, we need to sleep.
1641 */
1642
1643 if (skb == NULL)
1644 {
1645 sk->socket->flags |= SO_NOSPACE;
1646 if (nonblock)
1647 {
1648 release_sock(sk);
1649 if (copied)
1650 return(copied);
1651 return(-EAGAIN);
1652 }
1653
1654 /*
1655 * FIXME: here is another race condition.
1656 */
1657
1658 tmp = sk->wmem_alloc;
1659 release_sock(sk);
1660 cli();
1661 /*
1662 * Again we will try to avoid it.
1663 */
1664 if (tmp <= sk->wmem_alloc &&
1665 (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1666 && sk->err == 0)
1667 {
1668 sk->socket->flags &= ~SO_NOSPACE;
1669 interruptible_sleep_on(sk->sleep);
1670 if (current->signal & ~current->blocked)
1671 {
1672 sti();
1673 if (copied)
1674 return(copied);
1675 return(-ERESTARTSYS);
1676 }
1677 }
1678 sk->inuse = 1;
1679 sti();
1680 continue;
1681 }
1682
1683 skb->len = 0;
1684 skb->sk = sk;
1685 skb->free = 0;
1686 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1687
1688 buff = skb->data;
1689
1690 /*
1691 * FIXME: we need to optimize this.
1692 * Perhaps some hints here would be good.
1693 */
1694
1695 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1696 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
1697 if (tmp < 0 )
1698 {
1699 prot->wfree(sk, skb->mem_addr, skb->mem_len);
1700 release_sock(sk);
1701 if (copied)
1702 return(copied);
1703 return(tmp);
1704 }
1705 skb->len += tmp;
1706 skb->dev = dev;
1707 buff += tmp;
1708 skb->h.th =(struct tcphdr *) buff;
1709 tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
1710 if (tmp < 0)
1711 {
1712 prot->wfree(sk, skb->mem_addr, skb->mem_len);
1713 release_sock(sk);
1714 if (copied)
1715 return(copied);
1716 return(tmp);
1717 }
1718
1719 if (flags & MSG_OOB)
1720 {
1721 ((struct tcphdr *)buff)->urg = 1;
1722 ((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
1723 }
1724 skb->len += tmp;
1725 memcpy_fromfs(buff+tmp, from, copy);
1726
1727 from += copy;
1728 copied += copy;
1729 len -= copy;
1730 skb->len += copy;
1731 skb->free = 0;
1732 sk->write_seq += copy;
1733
1734 if (send_tmp != NULL && sk->packets_out)
1735 {
1736 tcp_enqueue_partial(send_tmp, sk);
1737 continue;
1738 }
1739 tcp_send_skb(sk, skb);
1740 }
1741 sk->err = 0;
1742
1743 /*
1744 * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
1745 * interactive fast network servers. It's meant to be on and
1746 * it really improves the throughput though not the echo time
1747 * on my slow slip link - Alan
1748 */
1749
1750 /*
1751 * Avoid possible race on send_tmp - c/o Johannes Stille
1752 */
1753
1754 if(sk->partial && ((!sk->packets_out)
1755 /* If not nagling we can send on the before case too.. */
1756 || (sk->nonagle && before(sk->write_seq , sk->window_seq))
1757 ))
1758 tcp_send_partial(sk);
1759
1760 release_sock(sk);
1761 return(copied);
1762 }
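/*
 * Turning Nagle off from user space, as the comment above suggests
 * for highly interactive traffic; a hedged, standalone fragment
 * where fd is assumed to be any TCP socket.
 */
#if 0
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int demo_disable_nagle(int fd)
{
	int one = 1;
	/* Trades more small packets for lower per-write latency. */
	return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
}
#endif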
1763
1764 /*
1765 * This is just a wrapper.
1766 */
1767
1768 static int tcp_sendto(struct sock *sk, unsigned char *from,
1769 int len, int nonblock, unsigned flags,
1770 struct sockaddr_in *addr, int addr_len)
1771 {
1772 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1773 return -EINVAL;
1774 if (sk->state == TCP_CLOSE)
1775 return -ENOTCONN;
1776 if (addr_len < sizeof(*addr))
1777 return -EINVAL;
1778 if (addr->sin_family && addr->sin_family != AF_INET)
1779 return -EINVAL;
1780 if (addr->sin_port != sk->dummy_th.dest)
1781 return -EISCONN;
1782 if (addr->sin_addr.s_addr != sk->daddr)
1783 return -EISCONN;
1784 return tcp_write(sk, from, len, nonblock, flags);
1785 }
1786
1787
1788 /*
1789 * Send an ack if one is backlogged at this point. Ought to merge
1790 * this with tcp_send_ack().
1791 */
1792
1793 static void tcp_read_wakeup(struct sock *sk)
1794 {
1795 int tmp;
1796 struct device *dev = NULL;
1797 struct tcphdr *t1;
1798 struct sk_buff *buff;
1799
1800 if (!sk->ack_backlog)
1801 return;
1802
1803 /*
1804 * FIXME: we need to put code here to prevent this routine from
1805 * being called. Being called once in a while is ok, so only check
1806 * if this is the second time in a row.
1807 */
1808
1809 /*
1810 * We need to grab some memory, and put together an ack,
1811 * and then put it into the queue to be sent.
1812 */
1813
1814 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
1815 if (buff == NULL)
1816 {
1817 /* Try again real soon. */
1818 reset_xmit_timer(sk, TIME_WRITE, HZ);
1819 return;
1820 }
1821
1822 buff->len = sizeof(struct tcphdr);
1823 buff->sk = sk;
1824 buff->localroute = sk->localroute;
1825
1826 /*
1827 * Put in the IP header and routing stuff.
1828 */
1829
1830 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
1831 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1832 if (tmp < 0)
1833 {
1834 buff->free = 1;
1835 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
1836 return;
1837 }
1838
1839 buff->len += tmp;
1840 t1 =(struct tcphdr *)(buff->data +tmp);
1841
1842 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
1843 t1->seq = htonl(sk->sent_seq);
1844 t1->ack = 1;
1845 t1->res1 = 0;
1846 t1->res2 = 0;
1847 t1->rst = 0;
1848 t1->urg = 0;
1849 t1->syn = 0;
1850 t1->psh = 0;
1851 sk->ack_backlog = 0;
1852 sk->bytes_rcv = 0;
1853 sk->window = tcp_select_window(sk);
1854 t1->window = ntohs(sk->window);
1855 t1->ack_seq = ntohl(sk->acked_seq);
1856 t1->doff = sizeof(*t1)/4;
1857 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
1858 sk->prot->queue_xmit(sk, dev, buff, 1);
1859 tcp_statistics.TcpOutSegs++;
1860 }
1861
1862
1863 /*
1864 * FIXME:
1865 * This routine frees used buffers.
1866 * It should consider sending an ACK to let the
1867 * other end know we now have a bigger window.
1868 */
1869
1870 static void cleanup_rbuf(struct sock *sk)
1871 {
1872 unsigned long flags;
1873 unsigned long left;
1874 struct sk_buff *skb;
1875 unsigned long rspace;
1876
1877 if(sk->debug)
1878 printk("cleaning rbuf for sk=%p\n", sk);
1879
1880 save_flags(flags);
1881 cli();
1882
1883 left = sk->prot->rspace(sk);
1884
1885 /*
1886 * We have to loop through all the buffer headers,
1887 * and try to free up all the space we can.
1888 */
1889
1890 while((skb=skb_peek(&sk->receive_queue)) != NULL)
1891 {
1892 if (!skb->used || skb->users)
1893 break;
1894 skb_unlink(skb);
1895 skb->sk = sk;
1896 kfree_skb(skb, FREE_READ);
1897 }
1898
1899 restore_flags(flags);
1900
1901 /*
1902 * FIXME:
1903 * At this point we should send an ack if the difference
1904 * between the window and the amount of free space is
1905 * bigger than TCP_WINDOW_DIFF.
1906 */
1907
1908 if(sk->debug)
1909 printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
1910 left);
1911 if ((rspace=sk->prot->rspace(sk)) != left)
1912 {
1913 /*
1914 * This area has caused the most trouble. The current strategy
1915 * is to simply do nothing if the other end has room to send at
1916 * least 3 full packets, because the ack from those will auto-
1917 * matically update the window. If the other end doesn't think
1918 * we have much space left, but we have room for at least 1 more
1919 * complete packet than it thinks we do, we will send an ack
1920 * immediately. Otherwise we will wait up to .5 seconds in case
1921 * the user reads some more.
1922 */
1923 sk->ack_backlog++;
1924 /*
1925 * It's unclear whether to use sk->mtu or sk->mss here. They differ only
1926 * if the other end is offering a window smaller than the agreed on MSS
1927 * (called sk->mtu here). In theory there's no connection between send
1928 * and receive, and so no reason to think that they're going to send
1929 * small packets. For the moment I'm using the hack of reducing the mss
1930 * only on the send side, so I'm putting mtu here.
1931 */
1932
1933 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
1934 {
1935 /* Send an ack right now. */
1936 tcp_read_wakeup(sk);
1937 }
1938 else
1939 {
1940 /* Force it to send an ack soon. */
1941 int was_active = del_timer(&sk->retransmit_timer);
1942 if (!was_active || TCP_ACK_TIME < sk->timer.expires)
1943 {
1944 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
1945 }
1946 else
1947 add_timer(&sk->retransmit_timer);
1948 }
1949 }
1950 }
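
/*
 * The window-update test above, as a standalone sketch. All arguments
 * are plain byte counts, and "mss" stands in for sk->mtu as discussed
 * in the comment inside cleanup_rbuf(). An immediate ack is worthwhile
 * only when the freed space lets us advertise at least one full packet
 * more than the peer currently believes it may send.
 */
static int should_ack_now(unsigned long rspace, unsigned long window,
	unsigned long bytes_rcv, unsigned long mss)
{
	return rspace > (window - bytes_rcv + mss);
}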
1951
1952
1953 /*
1954 * Handle reading urgent data. BSD has very simple semantics for
1955 * this, no blocking and very strange errors 8)
1956 */
1957
1958 static int tcp_read_urg(struct sock * sk, int nonblock,
1959 unsigned char *to, int len, unsigned flags)
1960 {
1961 /*
1962 * No URG data to read
1963 */
1964 if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
1965 return -EINVAL; /* Yes this is right ! */
1966
1967 if (sk->err)
1968 {
1969 int tmp = -sk->err;
1970 sk->err = 0;
1971 return tmp;
1972 }
1973
1974 if (sk->state == TCP_CLOSE || sk->done)
1975 {
1976 if (!sk->done) {
1977 sk->done = 1;
1978 return 0;
1979 }
1980 return -ENOTCONN;
1981 }
1982
1983 if (sk->shutdown & RCV_SHUTDOWN)
1984 {
1985 sk->done = 1;
1986 return 0;
1987 }
1988 sk->inuse = 1;
1989 if (sk->urg_data & URG_VALID)
1990 {
1991 char c = sk->urg_data;
1992 if (!(flags & MSG_PEEK))
1993 sk->urg_data = URG_READ;
1994 put_fs_byte(c, to);
1995 release_sock(sk);
1996 return 1;
1997 }
1998 release_sock(sk);
1999
2000 /*
2001 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
2002 * the available implementations agree in this case:
2003 * this call should never block, independent of the
2004 * blocking state of the socket.
2005 * Mike <pall@rz.uni-karlsruhe.de>
2006 */
2007 return -EAGAIN;
2008 }
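
/*
 * The userland view of the semantics implemented above (a sketch, not
 * part of this file): recv() with MSG_OOB never blocks. It returns one
 * byte on success; it fails with EINVAL if there is no urgent data (or
 * it was already read, or SO_OOBINLINE is set), and with EAGAIN if
 * urgent data has been signalled but has not yet arrived.
 */
#include <sys/socket.h>

int read_urgent_byte(int fd, char *out)
{
	return recv(fd, out, 1, MSG_OOB);	/* 1, or -1 with errno set */
}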
2009
2010
2011 /*
2012 * This routine copies from a sock struct into the user buffer.
2013 */
2014
2015 static int tcp_read(struct sock *sk, unsigned char *to,
2016 int len, int nonblock, unsigned flags)
2017 {
2018 struct wait_queue wait = { current, NULL };
2019 int copied = 0;
2020 unsigned long peek_seq;
2021 volatile unsigned long *seq; /* So gcc doesn't overoptimise */
2022 unsigned long used;
2023
2024 /*
2025 * This error should be checked.
2026 */
2027
2028 if (sk->state == TCP_LISTEN)
2029 return -ENOTCONN;
2030
2031 /*
2032 * Urgent data needs to be handled specially.
2033 */
2034
2035 if (flags & MSG_OOB)
2036 return tcp_read_urg(sk, nonblock, to, len, flags);
2037
2038 /*
2039 * Copying sequence to update. This is volatile to handle
2040 * the multi-reader case neatly (memcpy_to/fromfs might be
2041 * inline and thus not flush cached variables otherwise).
2042 */
2043
2044 peek_seq = sk->copied_seq;
2045 seq = &sk->copied_seq;
2046 if (flags & MSG_PEEK)
2047 seq = &peek_seq;
2048
2049 add_wait_queue(sk->sleep, &wait);
2050 sk->inuse = 1;
2051 while (len > 0)
2052 {
2053 struct sk_buff * skb;
2054 unsigned long offset;
2055
2056 /*
2057 * Are we at urgent data? Stop if we have read anything.
2058 */
2059
2060 if (copied && sk->urg_data && sk->urg_seq == *seq)
2061 break;
2062
2063 /*
2064 * Next get a buffer.
2065 */
2066
2067 current->state = TASK_INTERRUPTIBLE;
2068
2069 skb = skb_peek(&sk->receive_queue);
2070 do
2071 {
2072 if (!skb)
2073 break;
2074 if (before(*seq, skb->h.th->seq))
2075 break;
2076 offset = *seq - skb->h.th->seq;
2077 if (skb->h.th->syn)
2078 offset--;
2079 if (offset < skb->len)
2080 goto found_ok_skb;
2081 if (skb->h.th->fin)
2082 goto found_fin_ok;
2083 if (!(flags & MSG_PEEK))
2084 skb->used = 1;
2085 skb = skb->next;
2086 }
2087 while (skb != (struct sk_buff *)&sk->receive_queue);
2088
2089 if (copied)
2090 break;
2091
2092 if (sk->err)
2093 {
2094 copied = -sk->err;
2095 sk->err = 0;
2096 break;
2097 }
2098
2099 if (sk->state == TCP_CLOSE)
2100 {
2101 if (!sk->done)
2102 {
2103 sk->done = 1;
2104 break;
2105 }
2106 copied = -ENOTCONN;
2107 break;
2108 }
2109
2110 if (sk->shutdown & RCV_SHUTDOWN)
2111 {
2112 sk->done = 1;
2113 break;
2114 }
2115
2116 if (nonblock)
2117 {
2118 copied = -EAGAIN;
2119 break;
2120 }
2121
2122 cleanup_rbuf(sk);
2123 release_sock(sk);
2124 sk->socket->flags |= SO_WAITDATA;
2125 schedule();
2126 sk->socket->flags &= ~SO_WAITDATA;
2127 sk->inuse = 1;
2128
2129 if (current->signal & ~current->blocked)
2130 {
2131 copied = -ERESTARTSYS;
2132 break;
2133 }
2134 continue;
2135
2136 found_ok_skb:
2137 /*
2138 * Lock the buffer. We can be fairly relaxed as
2139 * an interrupt will never steal a buffer we are
2140 * using unless I've missed something serious in
2141 * tcp_data.
2142 */
2143
2144 skb->users++;
2145
2146 /*
2147 * Ok so how much can we use ?
2148 */
2149
2150 used = skb->len - offset;
2151 if (len < used)
2152 used = len;
2153 /*
2154 * Do we have urgent data here?
2155 */
2156
2157 if (sk->urg_data)
2158 {
2159 unsigned long urg_offset = sk->urg_seq - *seq;
2160 if (urg_offset < used)
2161 {
2162 if (!urg_offset)
2163 {
2164 if (!sk->urginline)
2165 {
2166 ++*seq;
2167 offset++;
2168 used--;
2169 }
2170 }
2171 else
2172 used = urg_offset;
2173 }
2174 }
2175
2176 /*
2177 * Copy it - We _MUST_ update *seq first so that we
2178 * don't ever double read when we have dual readers
2179 */
2180
2181 *seq += used;
2182
2183 /*
2184 * This memcpy_tofs can sleep. If it sleeps and we
2185 * do a second read it relies on the skb->users to avoid
2186 * a crash when cleanup_rbuf() gets called.
2187 */
2188
2189 memcpy_tofs(to,((unsigned char *)skb->h.th) +
2190 skb->h.th->doff*4 + offset, used);
2191 copied += used;
2192 len -= used;
2193 to += used;
2194
2195 /*
2196 * We now will not sleep again until we are finished
2197 * with skb. Sorry if you are doing the SMP port
2198 * but you'll just have to fix it neatly ;)
2199 */
2200
2201 skb->users --;
2202
2203 if (after(sk->copied_seq,sk->urg_seq))
2204 sk->urg_data = 0;
2205 if (used + offset < skb->len)
2206 continue;
2207
2208 /*
2209 * Process the FIN.
2210 */
2211
2212 if (skb->h.th->fin)
2213 goto found_fin_ok;
2214 if (flags & MSG_PEEK)
2215 continue;
2216 skb->used = 1;
2217 continue;
2218
2219 found_fin_ok:
2220 ++*seq;
2221 if (flags & MSG_PEEK)
2222 break;
2223
2224 /*
2225 * All is done
2226 */
2227
2228 skb->used = 1;
2229 sk->shutdown |= RCV_SHUTDOWN;
2230 break;
2231
2232 }
2233 remove_wait_queue(sk->sleep, &wait);
2234 current->state = TASK_RUNNING;
2235
2236 /* Clean up data we have read: This will do ACK frames */
2237 cleanup_rbuf(sk);
2238 release_sock(sk);
2239 return copied;
2240 }
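
/*
 * tcp_read() leans on before()/after() for every queue walk. The
 * trick, restated standalone under the assumption of 32-bit
 * arithmetic: take the unsigned difference and read it as signed,
 * which compares correctly across sequence-number wraparound as long
 * as the two values are within 2^31 of each other.
 */
static inline int seq_before(unsigned int s1, unsigned int s2)
{
	return (int)(s1 - s2) < 0;	/* s1 precedes s2 in sequence space */
}

static inline int seq_after(unsigned int s1, unsigned int s2)
{
	return (int)(s1 - s2) > 0;	/* s1 follows s2 */
}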
2241
2242 /*
2243 * State processing on a close. This implements the state shift for
2244 * sending our FIN frame. Note that we only send a FIN for some
2245 * states. A shutdown() may have already sent the FIN, or we may be
2246 * closed.
2247 */
2248
2249 static int tcp_close_state(struct sock *sk, int dead)
2250 {
2251 int ns=TCP_CLOSE;
2252 int send_fin=0;
2253 switch(sk->state)
2254 {
2255 case TCP_SYN_SENT: /* No SYN back, no FIN needed */
2256 break;
2257 case TCP_SYN_RECV:
2258 case TCP_ESTABLISHED: /* Closedown begin */
2259 ns=TCP_FIN_WAIT1;
2260 send_fin=1;
2261 break;
2262 case TCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */
2263 case TCP_FIN_WAIT2:
2264 case TCP_CLOSING:
2265 ns=sk->state;
2266 break;
2267 case TCP_CLOSE:
2268 case TCP_LISTEN:
2269 break;
2270 case TCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and
2271 wait only for the ACK */
2272 ns=TCP_LAST_ACK;
2273 send_fin=1;
2274 }
2275
2276 tcp_set_state(sk,ns);
2277
2278 /*
2279 * This is a (useful) BSD violation of the RFC. There is a
2280 * problem with TCP as specified, in that the other end could
2281 * keep a socket open forever with no application left at this end.
2282 * We use a 3 minute timeout (about the same as BSD), then kill
2283 * our end. If they send after that then tough - but make the timeout
2284 * long enough that we don't repeat the old "4*rto = almost no
2285 * time - whoops" reset mistake.
2286 */
2287 if(dead && ns==TCP_FIN_WAIT2)
2288 {
2289 int timer_active=del_timer(&sk->timer);
2290 if(timer_active)
2291 add_timer(&sk->timer);
2292 else
2293 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2294 }
2295
2296 return send_fin;
2297 }
2298
2299 /*
2300 * Send a fin.
2301 */
2302
2303 static void tcp_send_fin(struct sock *sk)
2304 {
2305 struct proto *prot =(struct proto *)sk->prot;
2306 struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
2307 struct tcphdr *t1;
2308 struct sk_buff *buff;
2309 struct device *dev=NULL;
2310 int tmp;
2311
2312 release_sock(sk); /* in case the malloc sleeps. */
2313
2314 buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2315 sk->inuse = 1;
2316
2317 if (buff == NULL)
2318 {
2319 /* This is a disaster if it occurs */
2320 printk("tcp_send_fin: Impossible malloc failure\n");
2321 return;
2322 }
2323
2324 /*
2325 * Administrivia
2326 */
2327
2328 buff->sk = sk;
2329 buff->len = sizeof(*t1);
2330 buff->localroute = sk->localroute;
2331 t1 =(struct tcphdr *) buff->data;
2332
2333 /*
2334 * Put in the IP header and routing stuff.
2335 */
2336
2337 tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2338 IPPROTO_TCP, sk->opt,
2339 sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
2340 if (tmp < 0)
2341 {
2342 int t;
2343 /*
2344 * Finish anyway, treat this as a send that got lost.
2345 * (Not good).
2346 */
2347
2348 buff->free = 1;
2349 prot->wfree(sk,buff->mem_addr, buff->mem_len);
2350 sk->write_seq++;
2351 t=del_timer(&sk->timer);
2352 if(t)
2353 add_timer(&sk->timer);
2354 else
2355 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2356 return;
2357 }
2358
2359 /*
2360 * We ought to check if the end of the queue is a buffer and
2361 * if so simply add the fin to that buffer, not send it ahead.
2362 */
2363
2364 t1 =(struct tcphdr *)((char *)t1 +tmp);
2365 buff->len += tmp;
2366 buff->dev = dev;
2367 memcpy(t1, th, sizeof(*t1));
2368 t1->seq = ntohl(sk->write_seq);
2369 sk->write_seq++;
2370 buff->h.seq = sk->write_seq;
2371 t1->ack = 1;
2372 t1->ack_seq = ntohl(sk->acked_seq);
2373 t1->window = ntohs(sk->window=tcp_select_window(sk));
2374 t1->fin = 1;
2375 t1->rst = 0;
2376 t1->doff = sizeof(*t1)/4;
2377 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2378
2379 /*
2380 * If there is data in the write queue, the fin must be appended to
2381 * the write queue.
2382 */
2383
2384 if (skb_peek(&sk->write_queue) != NULL)
2385 {
2386 buff->free = 0;
2387 if (buff->next != NULL)
2388 {
2389 printk("tcp_send_fin: next != NULL\n");
2390 skb_unlink(buff);
2391 }
2392 skb_queue_tail(&sk->write_queue, buff);
2393 }
2394 else
2395 {
2396 sk->sent_seq = sk->write_seq;
2397 sk->prot->queue_xmit(sk, dev, buff, 0);
2398 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2399 }
2400 }
2401
2402 /*
2403 * Shutdown the sending side of a connection. Much like close except
2404 * that we don't shut down the receive side or set sk->dead=1.
2405 */
2406
2407 void tcp_shutdown(struct sock *sk, int how)
2408 {
2409 /*
2410 * We need to grab some memory, and put together a FIN,
2411 * and then put it into the queue to be sent.
2412 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2413 */
2414
2415 if (!(how & SEND_SHUTDOWN))
2416 return;
2417
2418 /*
2419 * If we've already sent a FIN, or it's a closed state
2420 */
2421
2422 if (sk->state == TCP_FIN_WAIT1 ||
2423 sk->state == TCP_FIN_WAIT2 ||
2424 sk->state == TCP_CLOSING ||
2425 sk->state == TCP_LAST_ACK ||
2426 sk->state == TCP_TIME_WAIT ||
2427 sk->state == TCP_CLOSE ||
2428 sk->state == TCP_LISTEN
2429 )
2430 {
2431 return;
2432 }
2433 sk->inuse = 1;
2434
2435 /*
2436 * flag that the sender has shutdown
2437 */
2438
2439 sk->shutdown |= SEND_SHUTDOWN;
2440
2441 /*
2442 * Clear out any half completed packets.
2443 */
2444
2445 if (sk->partial)
2446 tcp_send_partial(sk);
2447
2448 /*
2449 * FIN if needed
2450 */
2451
2452 if(tcp_close_state(sk,0))
2453 tcp_send_fin(sk);
2454
2455 release_sock(sk);
2456 }
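
/*
 * Userland counterpart of the above (sketch): a half-close sends
 * exactly one FIN and leaves the receive side usable, so the peer's
 * remaining data can still be read. How the second argument maps onto
 * SEND_SHUTDOWN is internal to the socket layer.
 */
#include <sys/socket.h>

int half_close(int fd)
{
	return shutdown(fd, 1);	/* 1 = no more sends (SHUT_WR in modern headers) */
}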
2457
2458
2459 static int
2460 tcp_recvfrom(struct sock *sk, unsigned char *to,
2461 int to_len, int nonblock, unsigned flags,
2462 struct sockaddr_in *addr, int *addr_len)
2463 {
2464 int result;
2465
2466 /*
2467 * Have to check these first, unlike the old code. If we
2468 * checked them after the read we would lose data on an
2469 * error, which is wrong.
2470 */
2471
2472 if(addr_len)
2473 *addr_len = sizeof(*addr);
2474 result=tcp_read(sk, to, to_len, nonblock, flags);
2475
2476 if (result < 0)
2477 return(result);
2478
2479 if(addr)
2480 {
2481 addr->sin_family = AF_INET;
2482 addr->sin_port = sk->dummy_th.dest;
2483 addr->sin_addr.s_addr = sk->daddr;
2484 }
2485 return(result);
2486 }
2487
2488
2489 /*
2490 * This routine will send an RST to the other tcp.
2491 */
2492
2493 static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
2494 struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
2495 {
2496 struct sk_buff *buff;
2497 struct tcphdr *t1;
2498 int tmp;
2499 struct device *ndev=NULL;
2500
2501 /*
2502 * Cannot reset a reset (Think about it).
2503 */
2504
2505 if(th->rst)
2506 return;
2507
2508 /*
2509 * We need to grab some memory, and put together an RST,
2510 * and then put it into the queue to be sent.
2511 */
2512
2513 buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2514 if (buff == NULL)
2515 return;
2516
2517 buff->len = sizeof(*t1);
2518 buff->sk = NULL;
2519 buff->dev = dev;
2520 buff->localroute = 0;
2521
2522 t1 =(struct tcphdr *) buff->data;
2523
2524 /*
2525 * Put in the IP header and routing stuff.
2526 */
2527
2528 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2529 sizeof(struct tcphdr),tos,ttl);
2530 if (tmp < 0)
2531 {
2532 buff->free = 1;
2533 prot->wfree(NULL, buff->mem_addr, buff->mem_len);
2534 return;
2535 }
2536
2537 t1 =(struct tcphdr *)((char *)t1 +tmp);
2538 buff->len += tmp;
2539 memcpy(t1, th, sizeof(*t1));
2540
2541 /*
2542 * Swap the send and the receive.
2543 */
2544
2545 t1->dest = th->source;
2546 t1->source = th->dest;
2547 t1->rst = 1;
2548 t1->window = 0;
2549
2550 if(th->ack)
2551 {
2552 t1->ack = 0;
2553 t1->seq = th->ack_seq;
2554 t1->ack_seq = 0;
2555 }
2556 else
2557 {
2558 t1->ack = 1;
2559 if(!th->syn)
2560 t1->ack_seq=htonl(th->seq);
2561 else
2562 t1->ack_seq=htonl(th->seq+1);
2563 t1->seq=0;
2564 }
2565
2566 t1->syn = 0;
2567 t1->urg = 0;
2568 t1->fin = 0;
2569 t1->psh = 0;
2570 t1->doff = sizeof(*t1)/4;
2571 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2572 prot->queue_xmit(NULL, ndev, buff, 1);
2573 tcp_statistics.TcpOutSegs++;
2574 }
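
/*
 * The seq/ack selection above is RFC 793's reset rule. Restated as a
 * standalone sketch in host byte order (the real code converts with
 * htonl/ntohl): if the offending segment carried an ACK, reply with a
 * bare RST at the sequence number the peer expects; otherwise send
 * RST+ACK, acknowledging everything in the segment.
 */
struct rst_choice {
	unsigned int seq, ack_seq;
	int ack_bit;
};

static struct rst_choice choose_rst(int in_ack, unsigned int in_seq,
	unsigned int in_ack_seq, int in_syn)
{
	struct rst_choice r;
	if (in_ack) {
		r.seq = in_ack_seq;	/* believable to the receiver */
		r.ack_seq = 0;
		r.ack_bit = 0;
	} else {
		r.seq = 0;
		r.ack_seq = in_seq + (in_syn ? 1 : 0);	/* SYN counts as one */
		r.ack_bit = 1;
	}
	return r;
}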
2575
2576
2577 /*
2578 * Look for tcp options. Parses everything but only knows about MSS.
2579 * This routine is always called with the packet containing the SYN.
2580 * However it may also be called with the ack to the SYN. So you
2581 * can't assume this is always the SYN. It's always called after
2582 * we have set up sk->mtu to our own MTU.
2583 *
2584 * We need at minimum to add PAWS support here. Possibly large windows
2585 * as Linux gets deployed on 100Mb/sec networks.
2586 */
2587
2588 static void tcp_options(struct sock *sk, struct tcphdr *th)
2589 {
2590 unsigned char *ptr;
2591 int length=(th->doff*4)-sizeof(struct tcphdr);
2592 int mss_seen = 0;
2593
2594 ptr = (unsigned char *)(th + 1);
2595
2596 while(length>0)
2597 {
2598 int opcode=*ptr++;
2599 int opsize=*ptr++;
2600 switch(opcode)
2601 {
2602 case TCPOPT_EOL:
2603 return;
2604 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
2605 length--;
2606 ptr--; /* the opsize=*ptr++ above was a mistake */
2607 continue;
2608
2609 default:
2610 if(opsize<=2) /* Avoid silly options looping forever */
2611 return;
2612 switch(opcode)
2613 {
2614 case TCPOPT_MSS:
2615 if(opsize==4 && th->syn)
2616 {
2617 sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2618 mss_seen = 1;
2619 }
2620 break;
2621 /* Add other options here as people feel the urge to implement stuff like large windows */
2622 }
2623 ptr+=opsize-2;
2624 length-=opsize;
2625 }
2626 }
2627 if (th->syn)
2628 {
2629 if (! mss_seen)
2630 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */
2631 }
2632 #ifdef CONFIG_INET_PCTCP
2633 sk->mss = min(sk->max_window >> 1, sk->mtu);
2634 #else
2635 sk->mss = min(sk->max_window, sk->mtu);
2636 #endif
2637 }
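
/*
 * The wire format parsed above, restated: the MSS option is kind 2,
 * length 4, then a 16-bit MSS in network byte order. An encoder
 * (illustrative sketch; tcp_conn_request() below writes the same four
 * bytes by hand) would be:
 */
static void encode_mss_option(unsigned char *ptr, unsigned short mss)
{
	ptr[0] = 2;			/* TCPOPT_MSS */
	ptr[1] = 4;			/* total option length */
	ptr[2] = (mss >> 8) & 0xff;	/* high byte first on the wire */
	ptr[3] = mss & 0xff;
}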
2638
2639 static inline unsigned long default_mask(unsigned long dst)
2640 {
2641 dst = ntohl(dst);
2642 if (IN_CLASSA(dst))
2643 return htonl(IN_CLASSA_NET);
2644 if (IN_CLASSB(dst))
2645 return htonl(IN_CLASSB_NET);
2646 return htonl(IN_CLASSC_NET);
2647 }
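
/*
 * Classful examples of the above: 10.1.2.3 is class A, so the mask is
 * 255.0.0.0; 130.5.6.7 is class B, so 255.255.0.0; 192.0.2.1 is
 * class C, so 255.255.255.0.
 */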
2648
2649 /*
2650 * Default sequence number picking algorithm.
2651 * As close as possible to RFC 793, which
2652 * suggests using a 250kHz clock.
2653 * Further reading shows this assumes 2MB/s networks.
2654 * For 10MB/s ethernet, a 1MHz clock is appropriate.
2655 * That's funny, Linux has one built in! Use it!
2656 */
2657
2658 extern inline unsigned long tcp_init_seq(void)
2659 {
2660 struct timeval tv;
2661 do_gettimeofday(&tv);
2662 return tv.tv_usec+tv.tv_sec*1000000;
2663 }
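
/*
 * Worked out: at 1MHz the 32-bit sequence space wraps every 2^32
 * microseconds, i.e. about 4295 seconds or 71.6 minutes - far longer
 * than any segment lifetime, which is the property RFC 793's clock
 * scheme is after.
 */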
2664
2665 /*
2666 * This routine handles a connection request.
2667 * It should make sure we haven't already responded.
2668 * Because of the way BSD works, we have to send a syn/ack now.
2669 * This also means it will be harder to close a socket which is
2670 * listening.
2671 */
2672
2673 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
2674 unsigned long daddr, unsigned long saddr,
2675 struct options *opt, struct device *dev, unsigned long seq)
2676 {
2677 struct sk_buff *buff;
2678 struct tcphdr *t1;
2679 unsigned char *ptr;
2680 struct sock *newsk;
2681 struct tcphdr *th;
2682 struct device *ndev=NULL;
2683 int tmp;
2684 struct rtable *rt;
2685
2686 th = skb->h.th;
2687
2688 /* If the socket is dead, don't accept the connection. */
2689 if (!sk->dead)
2690 {
2691 sk->data_ready(sk,0);
2692 }
2693 else
2694 {
2695 if(sk->debug)
2696 printk("Reset on %p: Connect on dead socket.\n",sk);
2697 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2698 tcp_statistics.TcpAttemptFails++;
2699 kfree_skb(skb, FREE_READ);
2700 return;
2701 }
2702
2703 /*
2704 * Make sure we can accept more. This will prevent a
2705 * flurry of syns from eating up all our memory.
2706 */
2707
2708 if (sk->ack_backlog >= sk->max_ack_backlog)
2709 {
2710 tcp_statistics.TcpAttemptFails++;
2711 kfree_skb(skb, FREE_READ);
2712 return;
2713 }
2714
2715 /*
2716 * We need to build a new sock struct.
2717 * It is sort of bad to have a socket without an inode attached
2718 * to it, but the wake_up's will just wake up the listening socket,
2719 * and if the listening socket is destroyed before this is taken
2720 * off of the queue, this will take care of it.
2721 */
2722
2723 newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2724 if (newsk == NULL)
2725 {
2726 /* just ignore the syn. It will get retransmitted. */
2727 tcp_statistics.TcpAttemptFails++;
2728 kfree_skb(skb, FREE_READ);
2729 return;
2730 }
2731
2732 memcpy(newsk, sk, sizeof(*newsk));
2733 skb_queue_head_init(&newsk->write_queue);
2734 skb_queue_head_init(&newsk->receive_queue);
2735 newsk->send_head = NULL;
2736 newsk->send_tail = NULL;
2737 skb_queue_head_init(&newsk->back_log);
2738 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/
2739 newsk->rto = TCP_TIMEOUT_INIT;
2740 newsk->mdev = 0;
2741 newsk->max_window = 0;
2742 newsk->cong_window = 1;
2743 newsk->cong_count = 0;
2744 newsk->ssthresh = 0;
2745 newsk->backoff = 0;
2746 newsk->blog = 0;
2747 newsk->intr = 0;
2748 newsk->proc = 0;
2749 newsk->done = 0;
2750 newsk->partial = NULL;
2751 newsk->pair = NULL;
2752 newsk->wmem_alloc = 0;
2753 newsk->rmem_alloc = 0;
2754 newsk->localroute = sk->localroute;
2755
2756 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
2757
2758 newsk->err = 0;
2759 newsk->shutdown = 0;
2760 newsk->ack_backlog = 0;
2761 newsk->acked_seq = skb->h.th->seq+1;
2762 newsk->copied_seq = skb->h.th->seq+1;
2763 newsk->fin_seq = skb->h.th->seq;
2764 newsk->state = TCP_SYN_RECV;
2765 newsk->timeout = 0;
2766 newsk->ip_xmit_timeout = 0;
2767 newsk->write_seq = seq;
2768 newsk->window_seq = newsk->write_seq;
2769 newsk->rcv_ack_seq = newsk->write_seq;
2770 newsk->urg_data = 0;
2771 newsk->retransmits = 0;
2772 newsk->linger=0;
2773 newsk->destroy = 0;
2774 init_timer(&newsk->timer);
2775 newsk->timer.data = (unsigned long)newsk;
2776 newsk->timer.function = &net_timer;
2777 init_timer(&newsk->retransmit_timer);
2778 newsk->retransmit_timer.data = (unsigned long)newsk;
2779 newsk->retransmit_timer.function=&retransmit_timer;
2780 newsk->dummy_th.source = skb->h.th->dest;
2781 newsk->dummy_th.dest = skb->h.th->source;
2782
2783 /*
2784 * Swap these two, they are from our point of view.
2785 */
2786
2787 newsk->daddr = saddr;
2788 newsk->saddr = daddr;
2789
2790 put_sock(newsk->num,newsk);
2791 newsk->dummy_th.res1 = 0;
2792 newsk->dummy_th.doff = 6;
2793 newsk->dummy_th.fin = 0;
2794 newsk->dummy_th.syn = 0;
2795 newsk->dummy_th.rst = 0;
2796 newsk->dummy_th.psh = 0;
2797 newsk->dummy_th.ack = 0;
2798 newsk->dummy_th.urg = 0;
2799 newsk->dummy_th.res2 = 0;
2800 newsk->acked_seq = skb->h.th->seq + 1;
2801 newsk->copied_seq = skb->h.th->seq + 1;
2802 newsk->socket = NULL;
2803
2804 /*
2805 * Grab the ttl and tos values and use them
2806 */
2807
2808 newsk->ip_ttl=sk->ip_ttl;
2809 newsk->ip_tos=skb->ip_hdr->tos;
2810
2811 /*
2812 * Use 512 or whatever user asked for
2813 */
2814
2815 /*
2816 * Note use of sk->user_mss, since user has no direct access to newsk
2817 */
2818
2819 rt=ip_rt_route(saddr, NULL,NULL);
2820
2821 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
2822 newsk->window_clamp = rt->rt_window;
2823 else
2824 newsk->window_clamp = 0;
2825
2826 if (sk->user_mss)
2827 newsk->mtu = sk->user_mss;
2828 else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
2829 newsk->mtu = rt->rt_mss - HEADER_SIZE;
2830 else
2831 {
2832 #ifdef CONFIG_INET_SNARL /* Sub Nets Are Local */
2833 if ((saddr ^ daddr) & default_mask(saddr))
2834 #else
2835 if ((saddr ^ daddr) & dev->pa_mask)
2836 #endif
2837 newsk->mtu = 576 - HEADER_SIZE;
2838 else
2839 newsk->mtu = MAX_WINDOW;
2840 }
2841
2842 /*
2843 * But not bigger than device MTU
2844 */
2845
2846 newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);
2847
2848 /*
2849 * This will min with what arrived in the packet
2850 */
2851
2852 tcp_options(newsk,skb->h.th);
2853
2854 buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
2855 if (buff == NULL)
2856 {
2857 sk->err = ENOMEM;
2858 newsk->dead = 1;
2859 newsk->state = TCP_CLOSE;
2860 /* And this will destroy it */
2861 release_sock(newsk);
2862 kfree_skb(skb, FREE_READ);
2863 tcp_statistics.TcpAttemptFails++;
2864 return;
2865 }
2866
2867 buff->len = sizeof(struct tcphdr)+4;
2868 buff->sk = newsk;
2869 buff->localroute = newsk->localroute;
2870
2871 t1 =(struct tcphdr *) buff->data;
2872
2873 /*
2874 * Put in the IP header and routing stuff.
2875 */
2876
2877 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
2878 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
2879
2880 /*
2881 * Something went wrong.
2882 */
2883
2884 if (tmp < 0)
2885 {
2886 sk->err = tmp;
2887 buff->free = 1;
2888 kfree_skb(buff,FREE_WRITE);
2889 newsk->dead = 1;
2890 newsk->state = TCP_CLOSE;
2891 release_sock(newsk);
2892 skb->sk = sk;
2893 kfree_skb(skb, FREE_READ);
2894 tcp_statistics.TcpAttemptFails++;
2895 return;
2896 }
2897
2898 buff->len += tmp;
2899 t1 =(struct tcphdr *)((char *)t1 +tmp);
2900
2901 memcpy(t1, skb->h.th, sizeof(*t1));
2902 buff->h.seq = newsk->write_seq;
2903 /*
2904 * Swap the send and the receive.
2905 */
2906 t1->dest = skb->h.th->source;
2907 t1->source = newsk->dummy_th.source;
2908 t1->seq = ntohl(newsk->write_seq++);
2909 t1->ack = 1;
2910 newsk->window = tcp_select_window(newsk);
2911 newsk->sent_seq = newsk->write_seq;
2912 t1->window = ntohs(newsk->window);
2913 t1->res1 = 0;
2914 t1->res2 = 0;
2915 t1->rst = 0;
2916 t1->urg = 0;
2917 t1->psh = 0;
2918 t1->syn = 1;
2919 t1->ack_seq = ntohl(skb->h.th->seq+1);
2920 t1->doff = sizeof(*t1)/4+1;
2921 ptr =(unsigned char *)(t1+1);
2922 ptr[0] = 2;
2923 ptr[1] = 4;
2924 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
2925 ptr[3] =(newsk->mtu) & 0xff;
2926
2927 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
2928 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
2929 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
2930 skb->sk = newsk;
2931
2932 /*
2933 * Charge the sock_buff to newsk.
2934 */
2935
2936 sk->rmem_alloc -= skb->mem_len;
2937 newsk->rmem_alloc += skb->mem_len;
2938
2939 skb_queue_tail(&sk->receive_queue,skb);
2940 sk->ack_backlog++;
2941 release_sock(newsk);
2942 tcp_statistics.TcpOutSegs++;
2943 }
2944
2945
2946 static void tcp_close(struct sock *sk, int timeout)
2947 {
2948 /*
2949 * We need to grab some memory, and put together a FIN,
2950 * and then put it into the queue to be sent.
2951 */
2952
2953 sk->inuse = 1;
2954
2955 if(sk->state == TCP_LISTEN)
2956 {
2957 /* Special case */
2958 tcp_set_state(sk, TCP_CLOSE);
2959 tcp_close_pending(sk);
2960 release_sock(sk);
2961 return;
2962 }
2963
2964 sk->keepopen = 1;
2965 sk->shutdown = SHUTDOWN_MASK;
2966
2967 if (!sk->dead)
2968 sk->state_change(sk);
2969
2970 if (timeout == 0)
2971 {
2972 struct sk_buff *skb;
2973
2974 /*
2975 * We need to flush the recv. buffs. We do this only on the
2976 * descriptor close, not protocol-sourced closes, because the
2977 * reader process may not have drained the data yet!
2978 */
2979
2980 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
2981 kfree_skb(skb, FREE_READ);
2982 /*
2983 * Get rid of any half-completed packets.
2984 */
2985
2986 if (sk->partial)
2987 tcp_send_partial(sk);
2988 }
2989
2990
2991 /*
2992 * Timeout is not the same thing - however the code likes
2993 * to send both the same way (sigh).
2994 */
2995
2996 if(timeout)
2997 {
2998 tcp_set_state(sk, TCP_CLOSE); /* Dead */
2999 }
3000 else
3001 {
3002 if(tcp_close_state(sk,1)==1)
3003 {
3004 tcp_send_fin(sk);
3005 }
3006 }
3007 release_sock(sk);
3008 }
3009
3010
3011 /*
3012 * This routine takes stuff off of the write queue,
3013 * and puts it in the xmit queue. This happens as incoming acks
3014 * open up the remote window for us.
3015 */
3016
3017 static void tcp_write_xmit(struct sock *sk)
3018 {
3019 struct sk_buff *skb;
3020
3021 /*
3022 * The bytes will have to remain here. In time, closedown will
3023 * empty the write queue and all will be happy.
3024 */
3025
3026 if(sk->zapped)
3027 return;
3028
3029 /*
3030 * Anything on the transmit queue that fits the window can
3031 * be added providing we are not
3032 *
3033 * a) retransmitting (Nagle's rule)
3034 * b) exceeding our congestion window.
3035 */
3036
3037 while((skb = skb_peek(&sk->write_queue)) != NULL &&
3038 before(skb->h.seq, sk->window_seq + 1) &&
3039 (sk->retransmits == 0 ||
3040 sk->ip_xmit_timeout != TIME_WRITE ||
3041 before(skb->h.seq, sk->rcv_ack_seq + 1))
3042 && sk->packets_out < sk->cong_window)
3043 {
3044 IS_SKB(skb);
3045 skb_unlink(skb);
3046
3047 /*
3048 * See if we really need to send the packet.
3049 */
3050
3051 if (before(skb->h.seq, sk->rcv_ack_seq +1))
3052 {
3053 /*
3054 * This is acked data. We can discard it. This
3055 * cannot currently occur.
3056 */
3057
3058 sk->retransmits = 0;
3059 kfree_skb(skb, FREE_WRITE);
3060 if (!sk->dead)
3061 sk->write_space(sk);
3062 }
3063 else
3064 {
3065 struct tcphdr *th;
3066 struct iphdr *iph;
3067 int size;
3068 /*
3069 * put in the ack seq and window at this point rather than earlier,
3070 * in order to keep them monotonic. We really want to avoid taking
3071 * back window allocations. That's legal, but RFC1122 says it's frowned on.
3072 * Ack and window will in general have changed since this packet was put
3073 * on the write queue.
3074 */
3075 iph = (struct iphdr *)(skb->data +
3076 skb->dev->hard_header_len);
3077 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
3078 size = skb->len - (((unsigned char *) th) - skb->data);
3079
3080 th->ack_seq = ntohl(sk->acked_seq);
3081 th->window = ntohs(tcp_select_window(sk));
3082
3083 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
3084
3085 sk->sent_seq = skb->h.seq;
3086
3087 /*
3088 * IP manages our queue for some crazy reason
3089 */
3090
3091 sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
3092
3093 /*
3094 * Again we slide the timer wrongly
3095 */
3096
3097 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3098 }
3099 }
3100 }
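
/*
 * The while-condition above, as a standalone sketch (32-bit sequence
 * arithmetic assumed; the names are illustrative). A queued segment
 * may go out only if it fits the advertised window, we are not holding
 * new data back during a retransmission run, and the congestion window
 * has room.
 */
static int may_transmit(unsigned int seq_end, unsigned int window_seq,
	int retransmits, int timeout_is_write, unsigned int rcv_ack_seq,
	unsigned long packets_out, unsigned long cong_window)
{
	if ((int)(seq_end - (window_seq + 1)) >= 0)
		return 0;		/* would overrun the offered window */
	if (retransmits && timeout_is_write &&
	    (int)(seq_end - (rcv_ack_seq + 1)) >= 0)
		return 0;		/* retransmitting: no new data yet */
	return packets_out < cong_window;
}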
3101
3102
3103 /*
3104 * This routine deals with incoming acks, but not outgoing ones.
3105 */
3106
3107 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
3108 {
3109 unsigned long ack;
3110 int flag = 0;
3111
3112 /*
3113 * 1 - there was data in packet as well as ack or new data is sent or
3114 * in shutdown state
3115 * 2 - data from retransmit queue was acked and removed
3116 * 4 - window shrunk or data from retransmit queue was acked and removed
3117 */
3118
3119 if(sk->zapped)
3120 return(1); /* Dead, can't ack any more so why bother */
3121
3122 /*
3123 * Have we discovered a larger window?
3124 */
3125
3126 ack = ntohl(th->ack_seq);
3127
3128 if (ntohs(th->window) > sk->max_window)
3129 {
3130 sk->max_window = ntohs(th->window);
3131 #ifdef CONFIG_INET_PCTCP
3132 /* Hack because we don't send partial packets to non SWS
3133 handling hosts */
3134 sk->mss = min(sk->max_window>>1, sk->mtu);
3135 #else
3136 sk->mss = min(sk->max_window, sk->mtu);
3137 #endif
3138 }
3139
3140 /*
3141 * We have dropped back to keepalive timeouts. Thus we have
3142 * no retransmits pending.
3143 */
3144
3145 if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3146 sk->retransmits = 0;
3147
3148 /*
3149 * If the ack is newer than sent or older than previous acks
3150 * then we can probably ignore it.
3151 */
3152
3153 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
3154 {
3155 if(sk->debug)
3156 printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);
3157
3158 /*
3159 * Keepalive processing.
3160 */
3161
3162 if (after(ack, sk->sent_seq))
3163 {
3164 return(0);
3165 }
3166
3167 /*
3168 * Restart the keepalive timer.
3169 */
3170
3171 if (sk->keepopen)
3172 {
3173 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3174 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3175 }
3176 return(1);
3177 }
3178
3179 /*
3180 * If there is data set flag 1
3181 */
3182
3183 if (len != th->doff*4)
3184 flag |= 1;
3185
3186 /*
3187 * See if our window has been shrunk.
3188 */
3189
3190 if (after(sk->window_seq, ack+ntohs(th->window)))
3191 {
3192 /*
3193 * We may need to move packets from the send queue
3194 * to the write queue, if the window has been shrunk on us.
3195 * The RFC says you are not allowed to shrink your window
3196 * like this, but if the other end does, you must be able
3197 * to deal with it.
3198 */
3199 struct sk_buff *skb;
3200 struct sk_buff *skb2;
3201 struct sk_buff *wskb = NULL;
3202
3203 skb2 = sk->send_head;
3204 sk->send_head = NULL;
3205 sk->send_tail = NULL;
3206
3207 /*
3208 * This is an artifact of a flawed concept. We want one
3209 * queue and a smarter send routine when we send all.
3210 */
3211
3212 flag |= 4; /* Window changed */
3213
3214 sk->window_seq = ack + ntohs(th->window);
3215 cli();
3216 while (skb2 != NULL)
3217 {
3218 skb = skb2;
3219 skb2 = skb->link3;
3220 skb->link3 = NULL;
3221 if (after(skb->h.seq, sk->window_seq))
3222 {
3223 if (sk->packets_out > 0)
3224 sk->packets_out--;
3225 /* We may need to remove this from the dev send list. */
3226 if (skb->next != NULL)
3227 {
3228 skb_unlink(skb);
3229 }
3230 /* Now add it to the write_queue. */
3231 if (wskb == NULL)
3232 skb_queue_head(&sk->write_queue,skb);
3233 else
3234 skb_append(wskb,skb);
3235 wskb = skb;
3236 }
3237 else
3238 {
3239 if (sk->send_head == NULL)
3240 {
3241 sk->send_head = skb;
3242 sk->send_tail = skb;
3243 }
3244 else
3245 {
3246 sk->send_tail->link3 = skb;
3247 sk->send_tail = skb;
3248 }
3249 skb->link3 = NULL;
3250 }
3251 }
3252 sti();
3253 }
3254
3255 /*
3256 * Pipe has emptied
3257 */
3258
3259 if (sk->send_tail == NULL || sk->send_head == NULL)
3260 {
3261 sk->send_head = NULL;
3262 sk->send_tail = NULL;
3263 sk->packets_out= 0;
3264 }
3265
3266 /*
3267 * Update the right hand window edge of the host
3268 */
3269
3270 sk->window_seq = ack + ntohs(th->window);
3271
3272 /*
3273 * We don't want too many packets out there.
3274 */
3275
3276 if (sk->ip_xmit_timeout == TIME_WRITE &&
3277 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
3278 {
3279 /*
3280 * This is Jacobson's slow start and congestion avoidance.
3281 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
3282 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
3283 * counter and increment it once every cwnd times. It's possible
3284 * that this should be done only if sk->retransmits == 0. I'm
3285 * interpreting "new data is acked" as including data that has
3286 * been retransmitted but is just now being acked.
3287 */
3288 if (sk->cong_window < sk->ssthresh)
3289 /*
3290 * In "safe" area, increase
3291 */
3292 sk->cong_window++;
3293 else
3294 {
3295 /*
3296 * In dangerous area, increase slowly. In theory this is
3297 * sk->cong_window += 1 / sk->cong_window
3298 */
3299 if (sk->cong_count >= sk->cong_window)
3300 {
3301 sk->cong_window++;
3302 sk->cong_count = 0;
3303 }
3304 else
3305 sk->cong_count++;
3306 }
3307 }
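
	/*
	 * Worked example of the rule above: with ssthresh = 8, cwnd
	 * grows by one mss per ack - 1, 2, 4, ... doubling each round
	 * trip - until it reaches 8; after that cong_count must reach
	 * cwnd before the next increase, which works out to roughly
	 * one mss per round trip.
	 */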
3308
3309 /*
3310 * Remember the highest ack received.
3311 */
3312
3313 sk->rcv_ack_seq = ack;
3314
3315 /*
3316 * If this ack opens up a zero window, clear backoff. It was
3317 * being used to time the probes, and is probably far higher than
3318 * it needs to be for normal retransmission.
3319 */
3320
3321 if (sk->ip_xmit_timeout == TIME_PROBE0)
3322 {
3323 sk->retransmits = 0; /* Our probe was answered */
3324
3325 /*
3326 * Was it a usable window open ?
3327 */
3328
3329 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */
3330 ! before (sk->window_seq, sk->write_queue.next->h.seq))
3331 {
3332 sk->backoff = 0;
3333
3334 /*
3335 * Recompute rto from rtt. this eliminates any backoff.
3336 */
3337
3338 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3339 if (sk->rto > 120*HZ)
3340 sk->rto = 120*HZ;
3341 if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about
3342 .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3343 .2 of a second is going to need huge windows (SIGH) */
3344 sk->rto = 20;
3345 }
3346 }
3347
3348 /*
3349 * See if we can take anything off of the retransmit queue.
3350 */
3351
3352 while(sk->send_head != NULL)
3353 {
3354 /* Check for a bug. */
3355 if (sk->send_head->link3 &&
3356 after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
3357 printk("INET: tcp.c: *** bug send_list out of order.\n");
3358
3359 /*
3360 * If our packet is before the ack sequence we can
3361 * discard it, as it's confirmed to have arrived at the other end.
3362 */
3363
3364 if (before(sk->send_head->h.seq, ack+1))
3365 {
3366 struct sk_buff *oskb;
3367 if (sk->retransmits)
3368 {
3369 /*
3370 * We were retransmitting. don't count this in RTT est
3371 */
3372 flag |= 2;
3373
3374 /*
3375 * even though we've gotten an ack, we're still
3376 * retransmitting as long as we're sending from
3377 * the retransmit queue. Keeping retransmits non-zero
3378 * prevents us from getting new data interspersed with
3379 * retransmissions.
3380 */
3381
3382 if (sk->send_head->link3) /* Any more queued retransmits? */
3383 sk->retransmits = 1;
3384 else
3385 sk->retransmits = 0;
3386 }
3387 /*
3388 * Note that we only reset backoff and rto in the
3389 * rtt recomputation code. And that doesn't happen
3390 * if there were retransmissions in effect. So the
3391 * first new packet after the retransmissions is
3392 * sent with the backoff still in effect. Not until
3393 * we get an ack from a non-retransmitted packet do
3394 * we reset the backoff and rto. This allows us to deal
3395 * with a situation where the network delay has increased
3396 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3397 */
3398
3399 /*
3400 * We have one less packet out there.
3401 */
3402
3403 if (sk->packets_out > 0)
3404 sk->packets_out --;
3405 /*
3406 * Wake up the process, it can probably write more.
3407 */
3408 if (!sk->dead)
3409 sk->write_space(sk);
3410 oskb = sk->send_head;
3411
3412 if (!(flag&2)) /* Not retransmitting */
3413 {
3414 long m;
3415
3416 /*
3417 * The following amusing code comes from Jacobson's
3418 * article in SIGCOMM '88. Note that rtt and mdev
3419 * are scaled versions of rtt and mean deviation.
3420 * This is designed to be as fast as possible
3421 * m stands for "measurement".
3422 */
3423
3424 m = jiffies - oskb->when; /* RTT */
3425 if(m<=0)
3426 m=1; /* IS THIS RIGHT FOR <0 ??? */
3427 m -= (sk->rtt >> 3); /* m is now error in rtt est */
3428 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
3429 if (m < 0)
3430 m = -m; /* m is now abs(error) */
3431 m -= (sk->mdev >> 2); /* similar update on mdev */
3432 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
3433
3434 /*
3435 * Now update timeout. Note that this removes any backoff.
3436 */
3437
3438 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3439 if (sk->rto > 120*HZ)
3440 sk->rto = 120*HZ;
3441 if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3442 sk->rto = 20;
3443 sk->backoff = 0;
3444 }
3445 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt'
3446 in this case, as we just set it up */
3447 cli();
3448 oskb = sk->send_head;
3449 IS_SKB(oskb);
3450 sk->send_head = oskb->link3;
3451 if (sk->send_head == NULL)
3452 {
3453 sk->send_tail = NULL;
3454 }
3455
3456 /*
3457 * We may need to remove this from the dev send list.
3458 */
3459
3460 if (oskb->next)
3461 skb_unlink(oskb);
3462 sti();
3463 kfree_skb(oskb, FREE_WRITE); /* write. */
3464 if (!sk->dead)
3465 sk->write_space(sk);
3466 }
3467 else
3468 {
3469 break;
3470 }
3471 }
3472
3473 /*
3474 * XXX someone ought to look at this too. At the moment, if skb_peek()
3475 * returns non-NULL, we completely ignore the timer stuff in the else
3476 * clause. We ought to organize the code so that the else clause can
3477 * (should) be executed regardless, possibly moving the PROBE timer
3478 * reset over. The skb_peek() thing should only move stuff to the
3479 * write queue, NOT also manage the timer functions.
3480 */
3481
3482 /*
3483 * Maybe we can take some stuff off of the write queue,
3484 * and put it onto the xmit queue.
3485 */
3486 if (skb_peek(&sk->write_queue) != NULL)
3487 {
3488 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3489 (sk->retransmits == 0 ||
3490 sk->ip_xmit_timeout != TIME_WRITE ||
3491 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3492 && sk->packets_out < sk->cong_window)
3493 {
3494 /*
3495 * Add more data to the send queue.
3496 */
3497 flag |= 1;
3498 tcp_write_xmit(sk);
3499 }
3500 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3501 sk->send_head == NULL &&
3502 sk->ack_backlog == 0 &&
3503 sk->state != TCP_TIME_WAIT)
3504 {
3505 /*
3506 * Data to queue but no room.
3507 */
3508 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3509 }
3510 }
3511 else
3512 {
3513 /*
3514 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3515 * from TCP_CLOSE we don't do anything
3516 *
3517 * from anything else, if there is write data (or fin) pending,
3518 * we use a TIME_WRITE timeout, else if keepalive we reset to
3519 * a KEEPALIVE timeout, else we delete the timer.
3520 *
3521 * We do not set flag for nominal write data, otherwise we may
3522 * force a state where we start to write itsy bitsy tidbits
3523 * of data.
3524 */
3525
3526 switch(sk->state) {
3527 case TCP_TIME_WAIT:
3528 /*
3529 * keep us in TIME_WAIT until we stop getting packets,
3530 * reset the timeout.
3531 */
3532 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3533 break;
3534 case TCP_CLOSE:
3535 /*
3536 * don't touch the timer.
3537 */
3538 break;
3539 default:
3540 /*
3541 * Must check send_head, write_queue, and ack_backlog
3542 * to determine which timeout to use.
3543 */
3544 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3545 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3546 } else if (sk->keepopen) {
3547 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3548 } else {
3549 del_timer(&sk->retransmit_timer);
3550 sk->ip_xmit_timeout = 0;
3551 }
3552 break;
3553 }
3554 }
3555
3556 /*
3557 * We have nothing queued but space to send. Send any partial
3558 * packets immediately (end of Nagle rule application).
3559 */
3560
3561 if (sk->packets_out == 0 && sk->partial != NULL &&
3562 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
3563 {
3564 flag |= 1;
3565 tcp_send_partial(sk);
3566 }
3567
3568 /*
3569 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
3570 * we are now waiting for an acknowledge to our FIN. The other end is
3571 * already in TIME_WAIT.
3572 *
3573 * Move to TCP_CLOSE on success.
3574 */
3575
3576 if (sk->state == TCP_LAST_ACK)
3577 {
3578 if (!sk->dead)
3579 sk->state_change(sk);
3580 if(sk->debug)
3581 printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
3582 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3583 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
3584 {
3585 flag |= 1;
3586 tcp_set_state(sk,TCP_CLOSE);
3587 sk->shutdown = SHUTDOWN_MASK;
3588 }
3589 }
3590
3591 /*
3592 * Incoming ACK to a FIN we sent in the case of our initiating the close.
3593 *
3594 * Move to FIN_WAIT2 to await a FIN from the other end. Set
3595 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3596 */
3597
3598 if (sk->state == TCP_FIN_WAIT1)
3599 {
3600
3601 if (!sk->dead)
3602 sk->state_change(sk);
3603 if (sk->rcv_ack_seq == sk->write_seq)
3604 {
3605 flag |= 1;
3606 sk->shutdown |= SEND_SHUTDOWN;
3607 tcp_set_state(sk, TCP_FIN_WAIT2);
3608 }
3609 }
3610
3611 /*
3612 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
3613 *
3614 * Move to TIME_WAIT
3615 */
3616
3617 if (sk->state == TCP_CLOSING)
3618 {
3619
3620 if (!sk->dead)
3621 sk->state_change(sk);
3622 if (sk->rcv_ack_seq == sk->write_seq)
3623 {
3624 flag |= 1;
3625 tcp_time_wait(sk);
3626 }
3627 }
3628
3629 /*
3630 * Final ack of a three way shake
3631 */
3632
3633 if(sk->state==TCP_SYN_RECV)
3634 {
3635 tcp_set_state(sk, TCP_ESTABLISHED);
3636 tcp_options(sk,th);
3637 sk->dummy_th.dest=th->source;
3638 sk->copied_seq = sk->acked_seq;
3639 if(!sk->dead)
3640 sk->state_change(sk);
3641 if(sk->max_window==0)
3642 {
3643 sk->max_window=32; /* Sanity check */
3644 sk->mss=min(sk->max_window,sk->mtu);
3645 }
3646 }
3647
3648 /*
3649 * I make no guarantees about the first clause in the following
3650 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
3651 * what conditions "!flag" would be true. However I think the rest
3652 * of the conditions would prevent that from causing any
3653 * unnecessary retransmission.
3654 * Clearly if the first packet has expired it should be
3655 * retransmitted. The other alternative, "flag&2 && retransmits", is
3656 * harder to explain: You have to look carefully at how and when the
3657 * timer is set and with what timeout. The most recent transmission always
3658 * sets the timer. So in general if the most recent thing has timed
3659 * out, everything before it has as well. So we want to go ahead and
3660 * retransmit some more. If we didn't explicitly test for this
3661 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3662 * would not be true. If you look at the pattern of timing, you can
3663 * show that rto is increased fast enough that the next packet would
3664 * almost never be retransmitted immediately. Then you'd end up
3665 * waiting for a timeout to send each packet on the retransmission
3666 * queue. With my implementation of the Karn sampling algorithm,
3667 * the timeout would double each time. The net result is that it would
3668 * take a hideous amount of time to recover from a single dropped packet.
3669 * It's possible that there should also be a test for TIME_WRITE, but
3670 * I think as long as "send_head != NULL" and "retransmit" is on, we've
3671 * got to be in real retransmission mode.
3672 * Note that tcp_do_retransmit is called with all==1. Setting cong_window
3673 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3674 * As long as no further losses occur, this seems reasonable.
3675 */
3676
3677 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3678 (((flag&2) && sk->retransmits) ||
3679 (sk->send_head->when + sk->rto < jiffies)))
3680 {
3681 if(sk->send_head->when + sk->rto < jiffies)
3682 tcp_retransmit(sk,0);
3683 else
3684 {
3685 tcp_do_retransmit(sk, 1);
3686 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3687 }
3688 }
3689
3690 return(1);
3691 }
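
/*
 * The rtt/mdev arithmetic in tcp_ack() above, as a standalone sketch.
 * rtt is kept scaled by 8 and mdev by 4, so in integer terms the code
 * computes srtt += (m - srtt)/8 and mdev += (|err| - mdev)/4; the
 * resulting timeout works out to srtt + 2 * mean deviation, in
 * jiffies. (The bounds checks and Karn's rule are omitted here.)
 */
static void rtt_estimate(long *rtt, long *mdev, long *rto, long m)
{
	m -= (*rtt >> 3);	/* m is now the error in the rtt estimate */
	*rtt += m;		/* rtt = 7/8 rtt + 1/8 measurement */
	if (m < 0)
		m = -m;		/* |error| */
	m -= (*mdev >> 2);
	*mdev += m;		/* mdev = 3/4 mdev + 1/4 |error| */
	*rto = ((*rtt >> 2) + *mdev) >> 1;	/* == srtt + 2*mdev */
}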
3692
3693
3694 /*
3695 * Process the FIN bit. This now behaves as it is supposed to work
3696 * and the FIN takes effect when it is validly part of sequence
3697 * space, not earlier while there are still holes.
3698 *
3699 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3700 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
3701 * TIME-WAIT)
3702 *
3703 * If we are in FINWAIT-1, a received FIN indicates simultaneous
3704 * close and we go into CLOSING (and later onto TIME-WAIT)
3705 *
3706 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3707 *
3708 */
3709
3710 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
3711 {
3712 sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3713
3714 if (!sk->dead)
3715 {
3716 sk->state_change(sk);
3717 sock_wake_async(sk->socket, 1);
3718 }
3719
3720 switch(sk->state)
3721 {
3722 case TCP_SYN_RECV:
3723 case TCP_SYN_SENT:
3724 case TCP_ESTABLISHED:
3725 /*
3726 * move to CLOSE_WAIT, tcp_data() already handled
3727 * sending the ack.
3728 */
3729 tcp_set_state(sk,TCP_CLOSE_WAIT);
3730 if (th->rst)
3731 sk->shutdown = SHUTDOWN_MASK;
3732 break;
3733
3734 case TCP_CLOSE_WAIT:
3735 case TCP_CLOSING:
3736 /*
3737 * received a retransmission of the FIN, do
3738 * nothing.
3739 */
3740 break;
3741 case TCP_TIME_WAIT:
3742 /*
3743 * received a retransmission of the FIN,
3744 * restart the TIME_WAIT timer.
3745 */
3746 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3747 return(0);
3748 case TCP_FIN_WAIT1:
3749 /*
3750 * This case occurs when a simultaneous close
3751 * happens, we must ack the received FIN and
3752 * enter the CLOSING state.
3753 *
3754 * This causes a WRITE timeout, which will either
3755 * move on to TIME_WAIT when we timeout, or resend
3756 * the FIN properly (maybe we get rid of that annoying
3757 * FIN lost hang). The TIME_WRITE code is already correct
3758 * for handling this timeout.
3759 */
3760
3761 if(sk->ip_xmit_timeout != TIME_WRITE)
3762 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3763 tcp_set_state(sk,TCP_CLOSING);
3764 break;
3765 case TCP_FIN_WAIT2:
3766 /*
3767 * received a FIN -- send ACK and enter TIME_WAIT
3768 */
3769 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3770 sk->shutdown|=SHUTDOWN_MASK;
3771 tcp_set_state(sk,TCP_TIME_WAIT);
3772 break;
3773 case TCP_CLOSE:
3774 /*
3775 * already in CLOSE
3776 */
3777 break;
3778 default:
3779 tcp_set_state(sk,TCP_LAST_ACK);
3780
3781 /* Start the timers. */
3782 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3783 return(0);
3784 }
3785
3786 return(0);
3787 }
3788
3789
3790
3791 /*
3792 * This routine handles the data. If there is room in the buffer,
3793 * it will already have been moved into it. If there is no
3794 * room, then we will just have to discard the packet.
3795 */
3796
3797 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk,
3798 unsigned long saddr, unsigned short len)
3799 {
3800 struct sk_buff *skb1, *skb2;
3801 struct tcphdr *th;
3802 int dup_dumped=0;
3803 unsigned long new_seq;
3804 unsigned long shut_seq;
3805
3806 th = skb->h.th;
3807 skb->len = len -(th->doff*4);
3808
3809 /*
3810 * The bytes in the receive read/assembly queue have increased. Needed for the
3811 * low memory discard algorithm.
3812 */
3813
3814 sk->bytes_rcv += skb->len;
3815
3816 if (skb->len == 0 && !th->fin)
3817 {
3818 /*
3819 * Don't want to keep passing ack's back and forth.
3820 * (someone sent us a dataless, boring frame)
3821 */
3822 if (!th->ack)
3823 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3824 kfree_skb(skb, FREE_READ);
3825 return(0);
3826 }
3827
3828 /*
3829 * We no longer have anyone receiving data on this connection.
3830 */
3831
3832 #ifndef TCP_DONT_RST_SHUTDOWN
3833
3834 if(sk->shutdown & RCV_SHUTDOWN)
3835 {
3836 /*
3837 * FIXME: BSD has some magic to avoid sending resets to
3838 * broken 4.2 BSD keepalives. Much to my surprise a few non
3839 * BSD stacks still have broken keepalives so we want to
3840 * cope with it.
3841 */
3842
3843 if(skb->len) /* We don't care if it's just an ack or
3844 a keepalive/window probe */
3845 {
3846 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */
3847
3848 /* Do this the way 4.4BSD treats it. Not what I'd
3849 regard as the meaning of the spec but it's what BSD
3850 does and clearly they know everything 8) */
3851
3852 /*
3853 * This is valid because of two things
3854 *
3855 * a) The way tcp_data behaves at the bottom.
3856 * b) A fin takes effect when read not when received.
3857 */
3858
3859 shut_seq=sk->acked_seq+1; /* Last byte */
3860
3861 if(after(new_seq,shut_seq))
3862 {
3863 if(sk->debug)
3864 printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n",
3865 sk, new_seq, shut_seq, sk->blog);
3866 if(sk->dead)
3867 {
3868 sk->acked_seq = new_seq + th->fin;
3869 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3870 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3871 tcp_statistics.TcpEstabResets++;
3872 tcp_set_state(sk,TCP_CLOSE);
3873 sk->err = EPIPE;
3874 sk->shutdown = SHUTDOWN_MASK;
3875 kfree_skb(skb, FREE_READ);
3876 return 0;
3877 }
3878 }
3879 }
3880 }
3881
3882 #endif

	/*
	 * Now we have to walk the chain, and figure out where this one
	 * goes into it. This is set up so that the last packet we received
	 * will be the first one we look at, that way if everything comes
	 * in order, there will be no performance loss, and if they come
	 * out of order we will be able to fit things in nicely.
	 *
	 * [AC: This is wrong. We should assume in order first and then walk
	 * forwards from the first hole based upon real traffic patterns.]
	 */

	if (skb_peek(&sk->receive_queue) == NULL)	/* Empty queue is easy case */
	{
		skb_queue_head(&sk->receive_queue,skb);
		skb1 = NULL;
	}
	else
	{
		for (skb1 = sk->receive_queue.prev; ; skb1 = skb1->prev)
		{
			if (sk->debug)
			{
				printk("skb1=%p :", skb1);
				printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
				printk("skb->h.th->seq = %ld\n", skb->h.th->seq);
				printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
					sk->acked_seq);
			}

			/*
			 * Optimisation: Duplicate frame or extension of previous frame from
			 * same sequence point (lost ack case).
			 * The frame contains duplicate data or replaces a previous frame:
			 * discard the previous frame (safe as sk->inuse is set) and put
			 * the new one in its place.
			 */

			if (th->seq == skb1->h.th->seq && skb->len >= skb1->len)
			{
				skb_append(skb1,skb);
				skb_unlink(skb1);
				kfree_skb(skb1,FREE_READ);
				dup_dumped = 1;
				skb1 = NULL;
				break;
			}

			/*
			 * Found where it fits
			 */

			if (after(th->seq+1, skb1->h.th->seq))
			{
				skb_append(skb1,skb);
				break;
			}

			/*
			 * See if we've hit the start. If so insert.
			 */
			if (skb1 == skb_peek(&sk->receive_queue))
			{
				skb_queue_head(&sk->receive_queue, skb);
				break;
			}
		}
	}
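
	/*
	 * Editor's illustration: with segments 100-199 and 300-399 queued,
	 * a new segment starting at 200 is compared against 300 first
	 * (walking back from the tail), fails after(201, 300), then passes
	 * after(201, 100) and is appended after the 100-199 frame, leaving
	 * the queue in sequence order with the hole now at 400+.
	 */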

	/*
	 * Figure out what the ack value for this frame is
	 */

	th->ack_seq = th->seq + skb->len;
	if (th->syn)
		th->ack_seq++;
	if (th->fin)
		th->ack_seq++;

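	/*
	 * Editor's note: th->ack_seq is reused here to cache the right
	 * edge of this frame. E.g. a segment with th->seq == 1000
	 * carrying 500 bytes of data plus a FIN yields ack_seq == 1501,
	 * since the FIN occupies one unit of sequence space.
	 */
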
	if (before(sk->acked_seq, sk->copied_seq))
	{
		printk("*** tcp.c:tcp_data bug acked < copied\n");
		sk->acked_seq = sk->copied_seq;
	}

	/*
	 * Now figure out if we can ack anything. This is very messy because we really want two
	 * receive queues, a completed and an assembly queue. We also want only one transmit
	 * queue.
	 */

	if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
	{
		if (before(th->seq, sk->acked_seq+1))
		{
			int newwindow;

			if (after(th->ack_seq, sk->acked_seq))
			{
				newwindow = sk->window - (th->ack_seq - sk->acked_seq);
				if (newwindow < 0)
					newwindow = 0;
				sk->window = newwindow;
				sk->acked_seq = th->ack_seq;
			}
			skb->acked = 1;

			/*
			 * When we ack the fin, we do the FIN
			 * processing.
			 */

			if (skb->h.th->fin)
			{
				tcp_fin(skb,sk,skb->h.th);
			}

			for (skb2 = skb->next;
			     skb2 != (struct sk_buff *)&sk->receive_queue;
			     skb2 = skb2->next)
			{
				if (before(skb2->h.th->seq, sk->acked_seq+1))
				{
					if (after(skb2->h.th->ack_seq, sk->acked_seq))
					{
						newwindow = sk->window -
							(skb2->h.th->ack_seq - sk->acked_seq);
						if (newwindow < 0)
							newwindow = 0;
						sk->window = newwindow;
						sk->acked_seq = skb2->h.th->ack_seq;
					}
					skb2->acked = 1;
					/*
					 * When we ack the fin, we do
					 * the fin handling.
					 */
					if (skb2->h.th->fin)
					{
						tcp_fin(skb2,sk,skb2->h.th);
					}

					/*
					 * Force an immediate ack.
					 */

					sk->ack_backlog = sk->max_ack_backlog;
				}
				else
				{
					break;
				}
			}

			/*
			 * This also takes care of updating the window.
			 * This if statement needs to be simplified.
			 */
			if (!sk->delay_acks ||
			    sk->ack_backlog >= sk->max_ack_backlog ||
			    sk->bytes_rcv > sk->max_unacked || th->fin) {
				/* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
			}
			else
			{
				sk->ack_backlog++;
				if (sk->debug)
					printk("Ack queued.\n");
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
		}
	}
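
	/*
	 * Editor's note: the immediate tcp_send_ack() in the branch above
	 * is commented out because an ack is sent unconditionally at the
	 * bottom of tcp_data() anyway; the else branch merely counts the
	 * pending ack and arms a TCP_ACK_TIME timer as a fallback.
	 */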

	/*
	 * If we've missed a packet, send an ack.
	 * Also start a timer to send another.
	 */

	if (!skb->acked)
	{

		/*
		 * This is important. If we don't have much room left,
		 * we need to throw out a few packets so we have a good
		 * window. Note that mtu is used, not mss, because mss is really
		 * for the send side. The peer could be sending us stuff as large as mtu.
		 */

		while (sk->prot->rspace(sk) < sk->mtu)
		{
			skb1 = skb_peek(&sk->receive_queue);
			if (skb1 == NULL)
			{
				printk("INET: tcp.c:tcp_data memory leak detected.\n");
				break;
			}

			/*
			 * Don't throw out something that has been acked.
			 */

			if (skb1->acked)
			{
				break;
			}

			skb_unlink(skb1);
			kfree_skb(skb1, FREE_READ);
		}
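
		/*
		 * Editor's note: the pruning loop above discards from the
		 * head of the receive queue and stops at the first acked
		 * frame, so only out-of-order data we never acknowledged is
		 * dropped - the sender still holds it and will retransmit.
		 */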
		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
		sk->ack_backlog++;
		reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
	}
	else
	{
		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
	}

	/*
	 * Now tell the user we may have some data.
	 */

	if (!sk->dead)
	{
		if (sk->debug)
			printk("Data wakeup.\n");
		sk->data_ready(sk,0);
	}
	return(0);
}


/*
 * This routine is only called when we have urgent data
 * signalled. It's the 'slow' part of tcp_urg. It could be
 * moved inline now as tcp_urg is only called from one
 * place. We handle URGent data wrong. We have to - as
 * BSD still doesn't use the correction from RFC961.
 */

static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
{
	unsigned long ptr = ntohs(th->urg_ptr);

	if (ptr)
		ptr--;
	ptr += th->seq;

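	/*
	 * Editor's worked example: BSD treats the urgent pointer as
	 * pointing one past the urgent byte, so with th->seq == 1000 and
	 * th->urg_ptr == 4 the urgent byte sits at sequence
	 * 1000 + 4 - 1 == 1003, i.e. the fourth data byte of the segment.
	 */
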
	/* ignore urgent data that we've already seen and read */
	if (after(sk->copied_seq, ptr))
		return;

	/* do we already have a newer (or duplicate) urgent pointer? */
	if (sk->urg_data && !after(ptr, sk->urg_seq))
		return;

	/* tell the world about our new urgent pointer */
	if (sk->proc != 0) {
		if (sk->proc > 0) {
			kill_proc(sk->proc, SIGURG, 1);
		} else {
			kill_pg(-sk->proc, SIGURG, 1);
		}
	}
	sk->urg_data = URG_NOTYET;
	sk->urg_seq = ptr;
}

/*
 * This is the 'fast' part of urgent handling.
 */

extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
	unsigned long saddr, unsigned long len)
{
	unsigned long ptr;

	/*
	 * Check if we get a new urgent pointer - normally not
	 */

	if (th->urg)
		tcp_check_urg(sk,th);

	/*
	 * Do we wait for any urgent data? - normally not
	 */

	if (sk->urg_data != URG_NOTYET)
		return 0;

	/*
	 * Is the urgent pointer pointing into this packet?
	 */

	ptr = sk->urg_seq - th->seq + th->doff*4;
	if (ptr >= len)
		return 0;
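
	/*
	 * Editor's worked example: continuing the example above, if
	 * sk->urg_seq == 1003 and this segment has th->seq == 1000 with a
	 * 20 byte header (th->doff == 5), the urgent byte sits at octet
	 * 3 + 20 == 23 from the start of the TCP header, which is what
	 * the *(ptr + (unsigned char *) th) fetch below picks up.
	 */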

	/*
	 * Ok, got the correct packet, update info
	 */

	sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
	if (!sk->dead)
		sk->data_ready(sk,0);
	return 0;
}

/*
 * This will accept the next outstanding connection.
 */

static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 * We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */

	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race. */
	cli();
	sk->inuse = 1;

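	/*
	 * Editor's note: interrupts are off and inuse is set before the
	 * queue is tested, so a connection completing from the bottom
	 * half cannot slip in between "nothing pending" and the sleep
	 * below - the classic lost wakeup race.
	 */
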
	while ((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked)
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
	}
	sti();

	/*
	 * Now all we need to do is return skb->sk.
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}


/*
 * This will initiate an outgoing connection.
 */

static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev = NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
	{
		return(-EISCONN);
	}

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 * connect() to INADDR_ANY means loopback (BSD'ism).
	 */

	if (usin->sin_addr.s_addr == INADDR_ANY)
		usin->sin_addr.s_addr = ip_my_addr();

	/*
	 * Don't want a TCP connection going to a broadcast address
	 */

	if ((atype = ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype == IS_MULTICAST)
		return -ENETUNREACH;

	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq - 1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	release_sock(sk);

	buff = sk->prot->wmalloc(sk, MAX_SYN_SIZE, 0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->len = 24;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;

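	/*
	 * Editor's note: the 24 bytes reserved above are the 20 byte TCP
	 * header plus the 4 byte MSS option appended below, matching the
	 * t1->doff = 6 (6 * 4 == 24) set further down.
	 */
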
	t1 = (struct tcphdr *) buff->data;

	/*
	 * Put in the IP header and routing stuff.
	 */

	rt = ip_rt_route(sk->daddr, NULL, NULL);


	/*
	 * We need to build the routing stuff from the things saved in skb.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE, sk->ip_tos, sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 + tmp);

	memcpy(t1, (void *)&(sk->dummy_th), sizeof(*t1));
	t1->seq = ntohl(sk->write_seq++);
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;
	/* use 512 or whatever user asked for */

	if (rt != NULL && (rt->rt_flags & RTF_WINDOW))
		sk->window_clamp = rt->rt_window;
	else
		sk->window_clamp = 0;

	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if (rt != NULL && (rt->rt_flags & RTF_MTU))
		sk->mtu = rt->rt_mss;
	else
	{
#ifdef CONFIG_INET_SNARL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - HEADER_SIZE;
		else
			sk->mtu = MAX_WINDOW;
	}

	/*
	 * but not bigger than device MTU
	 */

	if (sk->mtu < 32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 * Put in the TCP options to say MTU.
	 */

	ptr = (unsigned char *)(t1+1);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		sizeof(struct tcphdr) + 4, sk);
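
	/*
	 * Editor's note: the four option bytes above are the standard MSS
	 * option - kind 2, length 4, then the MSS value in network byte
	 * order. With sk->mtu == 1460 they would read 02 04 05 b4.
	 */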

	/*
	 * This must go first otherwise a really quick response will get reset.
	 */

	tcp_set_state(sk,TCP_SYN_SENT);
	sk->rto = TCP_TIMEOUT_INIT;
#if 0 /* we already did this */
	init_timer(&sk->retransmit_timer);
#endif
	sk->retransmit_timer.function = &retransmit_timer;
	sk->retransmit_timer.data = (unsigned long) sk;
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = TCP_SYN_RETRIES;

	sk->prot->queue_xmit(sk, dev, buff, 0);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}


/* This function checks to see if the tcp header is actually acceptable. */
extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
	struct options *opt, unsigned long saddr, struct device *dev)
{
	unsigned long next_seq;

	next_seq = len - 4*th->doff;
	if (th->fin)
		next_seq++;
	/* if we have a zero window, we can't have any data in the packet.. */
	if (next_seq && !sk->window)
		goto ignore_it;
	next_seq += th->seq;

	/*
	 * This isn't quite right. sk->acked_seq could be more recent
	 * than sk->window. This is however close enough. We will accept
	 * slightly more packets than we should, but it should not cause
	 * problems unless someone is trying to forge packets.
	 */

	/* have we already seen all of this packet? */
	if (!after(next_seq+1, sk->acked_seq))
		goto ignore_it;
	/* or does it start beyond the window? */
	if (!before(th->seq, sk->acked_seq + sk->window + 1))
		goto ignore_it;

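	/*
	 * Editor's worked example: with acked_seq == 2000 and window ==
	 * 1000, a segment survives the two tests above only if its right
	 * edge (next_seq) reaches at least 2000 and it starts no later
	 * than 3000, i.e. it overlaps the region we might still care about.
	 */
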
	/* ok, at least part of this packet would seem interesting.. */
	return 1;

ignore_it:
	if (th->rst)
		return 0;

	/*
	 * Send a reset if we get something not ours and we are
	 * unsynchronized. Note: We don't do anything to our end. We
	 * are just killing the bogus remote connection then we will
	 * connect again and it will work (with luck).
	 */

	if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
	{
		tcp_reset(sk->saddr, sk->daddr, th, sk->prot, NULL, dev, sk->ip_tos, sk->ip_ttl);
		return 1;
	}

	/* Try to resync things. */
	tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
	return 0;
}

/*
 * When we get a reset we do this.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;
	sk->err = ECONNRESET;
	if (sk->state == TCP_SYN_SENT)
		sk->err = ECONNREFUSED;
	if (sk->state == TCP_CLOSE_WAIT)
		sk->err = EPIPE;
#ifdef TCP_DO_RFC1337
	/*
	 * Time wait assassination protection [RFC1337]
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}

/*
 * A TCP packet has arrived.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	unsigned long daddr, unsigned short len,
	unsigned long saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok = 0;

	if (!skb)
	{
		printk("IMPOSSIBLE 1\n");
		return(0);
	}

	if (!dev)
	{
		printk("IMPOSSIBLE 2\n");
		return(0);
	}

	tcp_statistics.TcpInSegs++;

	if (skb->pkt_type != PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 * Find the socket.
	 */

	sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);

	/*
	 * If this socket has got a reset it's to all intents and purposes
	 * really dead. Count closed sockets as dead.
	 *
	 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 * simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 * exist so should cause resets as if the port was unreachable.
	 */

	if (sk != NULL && (sk->zapped || sk->state == TCP_CLOSE))
		sk = NULL;

	if (!redo)
	{
		if (tcp_check(th, len, saddr, daddr))
		{
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 * We don't release the socket because it was
			 * never marked in use.
			 */
			return(0);
		}
		th->seq = ntohl(th->seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 * No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
			skb->sk = NULL;
			/*
			 * Discard frame
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->len = len;
		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here. */
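		/*
		 * Editor's note: if the socket is already in use by a
		 * process, the frame is parked on sk->back_log below;
		 * release_sock() later replays it through tcp_rcv() with
		 * redo set, which is why the redo path skips the checksum
		 * and lookup work already done here.
		 */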
		cli();
		if (sk->inuse)
		{
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		if (sk == NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 * Charge the memory to the socket.
	 */

	if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

	skb->sk = sk;
	sk->rmem_alloc += skb->mem_len;

	/*
	 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 * compatibility. We also set up variables more thoroughly [Karn notes in the
	 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if (sk->state != TCP_ESTABLISHED)	/* Skip this lot for normal flow */
	{

		/*
		 * Now deal with unusual cases.
		 */

		if (sk->state == TCP_LISTEN)
		{
			if (th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);

			/*
			 * We don't care for RST, and non-SYN segments are
			 * absorbed (old segments). Broadcast/multicast SYN
			 * isn't allowed. Note - bug if you change the netmask
			 * on a running connection it can go broadcast. Even
			 * Suns have this problem so I'm ignoring it.
			 */

			if (th->rst || !th->syn || th->ack || ip_chk_addr(daddr) != IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 * Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 * Now we have several options: In theory there is nothing else
			 * in the frame. KA9Q has an option to send data with the syn,
			 * BSD accepts data with the syn up to the [to be] advertised window
			 * and Solaris 2.1 gives you a protocol error. For now we just ignore
			 * it, that fits the spec precisely and avoids incompatibilities. It
			 * would be nice in future to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 * SYN sent means we have to look for a suitable ack and either reset
		 * for bad matches or go to connected
		 */

		if (sk->state == TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if (th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if (!tcp_ack(sk,th,saddr,len))
				{
					/* Reset the ack - it's an ack from a
					   different connection [ th->rst is checked in tcp_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if (th->rst)
					return tcp_std_reset(sk,skb);
				if (!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}
				/*
				 * Ok.. it's good. Set up sequence numbers and
				 * move to established.
				 */
				syn_ok = 1;	/* Don't reset this connection for the syn */
				sk->acked_seq = th->seq+1;
				sk->fin_seq = th->seq;
				tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest = th->source;
				sk->copied_seq = sk->acked_seq;
				if (!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if (sk->max_window == 0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if (th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if (sk->saddr == saddr && sk->daddr == daddr &&
					    sk->dummy_th.source == th->source &&
					    sk->dummy_th.dest == th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 * FIXME:
					 * Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 * SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

		/*
		 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
		 * a more complex suggestion for fixing these reuse issues in RFC1644
		 * but not yet ready for general use. Also see RFC1379.
		 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
		    after(th->seq, sk->acked_seq) && !th->rst)
		{
			long seq = sk->write_seq;
			if (sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			sk->rmem_alloc -= skb->mem_len;
			skb->sk = NULL;
			sk->err = ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state == TCP_LISTEN)
			{
				sk->inuse = 1;
				skb->sk = sk;
				sk->rmem_alloc += skb->mem_len;
				tcp_conn_request(sk, skb, daddr, saddr, opt, dev, seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
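
	/*
	 * Editor's illustration of the hack above: a dead socket lingering
	 * in TIME_WAIT is killed on the spot when a fresh SYN with a newer
	 * sequence number arrives for the same port pair, and the SYN is
	 * handed to the listener with an initial sequence of the old
	 * write_seq + 128000, so the new incarnation cannot collide with
	 * old segments.
	 */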
	}

	/*
	 * We are now in normal data flow (see the step list in the RFC)
	 * Note most of these are inline now. I'll inline the lot when
	 * I have time to test it hard and look at what gcc outputs
	 */
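
	/*
	 * Editor's note: the order below mirrors RFC793's segment
	 * processing steps - sequence check, then RST, then SYN,
	 * then ACK, then URG, then the data itself.
	 */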

	if (!tcp_sequence(sk, th, len, opt, saddr, dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if (th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 * !syn_ok is effectively the state test in RFC793.
	 */

	if (th->syn && !syn_ok)
	{
		tcp_reset(daddr, saddr, th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}

	/*
	 * Process the ACK
	 */

	if (th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 * Our three way handshake failed.
		 */

		if (sk->state == TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:	/* I'll clean this up later */

	/*
	 * Process urgent data
	 */

	if (tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 * Process the encapsulated data
	 */

	if (tcp_data(skb, sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 * And done
	 */

	release_sock(sk);
	return 0;
}

/*
 * This routine sends a packet with an out of date sequence
 * number. It assumes the other end will try to ack it.
 */

static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 * Write data can still be transmitted/retransmitted in the
	 * following states. If any other state is encountered, return.
	 * [listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED &&
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 &&
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING)
	{
		return;
	}

	buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
		return;

	buff->len = sizeof(struct tcphdr);
	buff->free = 1;
	buff->sk = sk;
	buff->localroute = sk->localroute;

	t1 = (struct tcphdr *) buff->data;

	/* Put in the IP header and routing stuff. */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, sk->opt, MAX_ACK_SIZE, sk->ip_tos, sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}

	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 + tmp);

	memcpy(t1, (void *) &sk->dummy_th, sizeof(*t1));

	/*
	 * Use a previous sequence.
	 * This should cause the other end to send an ack.
	 */
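	/*
	 * Editor's note: sent_seq-1 is a sequence number below anything
	 * new, so the receiver treats the empty segment as a duplicate,
	 * discards it, and replies with an ACK advertising its current
	 * window - exactly what a zero-window probe needs.
	 */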

	t1->seq = htonl(sk->sent_seq-1);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
	t1->syn = 0;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(tcp_select_window(sk));
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	/*
	 * Send it and free it.
	 * This will prevent the timer from automatically being restarted.
	 */
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}

/*
 * A window probe timeout has occurred.
 */

void tcp_send_probe0(struct sock *sk)
{
	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	tcp_write_wakeup(sk);

	sk->backoff++;
	sk->rto = min(sk->rto << 1, 120*HZ);
	reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	sk->retransmits++;
	sk->prot->retransmits++;
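
	/*
	 * Editor's note: the probe interval backs off exponentially -
	 * each timeout doubles sk->rto, capped at 120*HZ jiffies
	 * (two minutes), so probes go out ever more slowly while the
	 * peer's window stays closed.
	 */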
}

/*
 * Socket option code for TCP.
 */

int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
{
	int val, err;

	if (level != SOL_TCP)
		return ip_setsockopt(sk, level, optname, optval, optlen);

	if (optval == NULL)
		return(-EINVAL);

	err = verify_area(VERIFY_READ, optval, sizeof(int));
	if (err)
		return err;

	val = get_fs_long((unsigned long *)optval);

	switch(optname)
	{
		case TCP_MAXSEG:
			/*
			 * Values greater than the interface MTU won't take
			 * effect. However, at the point when this call is made
			 * we typically don't yet know which interface is going
			 * to be used.
			 */
			if (val < 1 || val > MAX_WINDOW)
				return -EINVAL;
			sk->user_mss = val;
			return 0;
		case TCP_NODELAY:
			sk->nonagle = (val == 0) ? 0 : 1;
			return 0;
		default:
			return(-ENOPROTOOPT);
	}
}
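
/*
 * Editor's illustration (hypothetical user-level code): disabling the
 * Nagle algorithm on a socket ends up in the TCP_NODELAY case above
 * and sets sk->nonagle:
 *
 *	int one = 1;
 *	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
 */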

int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
{
	int val, err;

	if (level != SOL_TCP)
		return ip_getsockopt(sk, level, optname, optval, optlen);

	switch(optname)
	{
		case TCP_MAXSEG:
			val = sk->user_mss;
			break;
		case TCP_NODELAY:
			val = sk->nonagle;
			break;
		default:
			return(-ENOPROTOOPT);
	}
	err = verify_area(VERIFY_WRITE, optlen, sizeof(int));
	if (err)
		return err;
	put_fs_long(sizeof(int), (unsigned long *) optlen);

	err = verify_area(VERIFY_WRITE, optval, sizeof(int));
	if (err)
		return err;
	put_fs_long(val, (unsigned long *) optval);

	return(0);
}


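/*
 * Editor's note: the initialiser below fills struct proto positionally
 * (pre-C99 style): the sock_* memory helpers first, then the TCP entry
 * points (close, read, write, ... rcv, select, ioctl, shutdown, the
 * socket option handlers), then what appear to be the protocol
 * constants and the "TCP" name used for reporting.
 */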
struct proto tcp_prot = {
	sock_wmalloc,
	sock_rmalloc,
	sock_wfree,
	sock_rfree,
	sock_rspace,
	sock_wspace,
	tcp_close,
	tcp_read,
	tcp_write,
	tcp_sendto,
	tcp_recvfrom,
	ip_build_header,
	tcp_connect,
	tcp_accept,
	ip_queue_xmit,
	tcp_retransmit,
	tcp_write_wakeup,
	tcp_read_wakeup,
	tcp_rcv,
	tcp_select,
	tcp_ioctl,
	NULL,
	tcp_shutdown,
	tcp_setsockopt,
	tcp_getsockopt,
	128,
	0,
	{NULL,},
	"TCP",
	0, 0
};