1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: @(#)tcp.c 1.0.16 05/25/93
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * Fixes:
23 * Alan Cox : Numerous verify_area() calls
24 * Alan Cox : Set the ACK bit on a reset
25 * Alan Cox : Stopped it crashing if it closed while sk->inuse=1
26 * and was trying to connect (tcp_err()).
27 * Alan Cox : All icmp error handling was broken
28 * pointers passed where wrong and the
29 * socket was looked up backwards. Nobody
30 * tested any icmp error code obviously.
31 * Alan Cox : tcp_err() now handled properly. It wakes people
32 * on errors. select behaves and the icmp error race
33 * has gone by moving it into sock.c
34 * Alan Cox : tcp_reset() fixed to work for everything not just
35 * packets for unknown sockets.
36 * Alan Cox : tcp option processing.
37 * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong]
38 * Herp Rosmanith : More reset fixes
39 * Alan Cox : No longer acks invalid rst frames. Acking
40 * any kind of RST is right out.
41 * Alan Cox : Sets an ignore me flag on an rst receive
42 * otherwise odd bits of prattle escape still
43 * Alan Cox : Fixed another acking RST frame bug. Should stop
44 * LAN workplace lockups.
45 * Alan Cox : Some tidyups using the new skb list facilities
46 * Alan Cox : sk->keepopen now seems to work
47 * Alan Cox : Pulls options out correctly on accepts
48 * Alan Cox : Fixed assorted sk->rqueue->next errors
49 * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops.
50 * Alan Cox : Tidied tcp_data to avoid a potential nasty.
51 * Alan Cox : Added some better commenting, as the tcp is hard to follow
52 * Alan Cox : Removed incorrect check for 20 * psh
53 * Michael O'Reilly : ack < copied bug fix.
54 * Johannes Stille : Misc tcp fixes (not all in yet).
55 * Alan Cox : FIN with no memory -> CRASH
56 * Alan Cox : Added socket option proto entries. Also added awareness of them to accept.
57 * Alan Cox : Added TCP options (SOL_TCP)
58 * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets.
59 * Alan Cox : Use ip_tos/ip_ttl settings.
60 * Alan Cox : Handle FIN (more) properly (we hope).
61 * Alan Cox : RST frames sent on unsynchronised state ack error/
62 * Alan Cox : Put in missing check for SYN bit.
63 * Alan Cox : Added tcp_select_window() aka NET2E
64 * window non shrink trick.
65 * Alan Cox : Added a couple of small NET2E timer fixes
66 * Charles Hedrick : TCP fixes
67 * Toomas Tamm : TCP window fixes
68 * Alan Cox : Small URG fix to rlogin ^C ack fight
69 * Charles Hedrick : Rewrote most of it to actually work
70 * Linus : Rewrote tcp_read() and URG handling
71 * completely
72 * Gerhard Koerting: Fixed some missing timer handling
73 * Matthew Dillon : Reworked TCP machine states as per RFC
74 * Gerhard Koerting: PC/TCP workarounds
75 * Adam Caldwell : Assorted timer/timing errors
76 * Matthew Dillon : Fixed another RST bug
77 * Alan Cox : Move to kernel side addressing changes.
78 * Alan Cox : Beginning work on TCP fastpathing (not yet usable)
79 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
80 * Alan Cox : TCP fast path debugging
81 * Alan Cox : Window clamping
82 * Michael Riepe : Bug in tcp_check()
83 * Matt Dillon : More TCP improvements and RST bug fixes
84 * Matt Dillon : Yet more small nasties remove from the TCP code
85 * (Be very nice to this man if tcp finally works 100%) 8)
86 * Alan Cox : BSD accept semantics.
87 * Alan Cox : Reset on closedown bug.
88 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
89 * Michael Pall : Handle select() after URG properly in all cases.
90 * Michael Pall : Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin).
91 * Michael Pall : Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now.
92 * Michael Pall : recv(...,MSG_OOB) never blocks in the BSD api.
93 * Alan Cox : Changed the semantics of sk->socket to
94 * fix a race and a signal problem with
95 * accept() and async I/O.
96 * Alan Cox : Relaxed the rules on tcp_sendto().
97 * Yury Shevchuk : Really fixed accept() blocking problem.
98 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
99 * clients/servers which listen in on
100 * fixed ports.
101 * Alan Cox : Cleaned the above up and shrank it to
102 * a sensible code size.
103 * Alan Cox : Self connect lockup fix.
104 * Alan Cox : No connect to multicast.
105 * Ross Biro : Close unaccepted children on master
106 * socket close.
107 * Alan Cox : Reset tracing code.
108 * Alan Cox : Spurious resets on shutdown.
109 * Alan Cox : Giant 15 minute/60 second timer error
110 * Alan Cox : Small whoops in selecting before an accept.
111 * Alan Cox : Kept the state trace facility since it's
112 * handy for debugging.
113 * Alan Cox : More reset handler fixes.
114 * Alan Cox : Started rewriting the code based on the RFC's
115 * for other useful protocol references see:
116 * Comer, KA9Q NOS, and for a reference on the
117 * difference between specifications and how BSD
118 * works see the 4.4lite source.
119 * A.N.Kuznetsov : Don't time wait on completion of tidy
120 * close.
121 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
122 * Linus Torvalds : Fixed BSD port reuse to work first syn
123 * Alan Cox : Reimplemented timers as per the RFC and using multiple
124 * timers for sanity.
125 * Alan Cox : Small bug fixes, and a lot of new
126 * comments.
127 * Alan Cox : Fixed dual reader crash by locking
128 * the buffers (much like datagram.c)
129 * Alan Cox : Fixed stuck sockets in probe. A probe
130 * now gets fed up of retrying without
131 * (even a no space) answer.
132 * Alan Cox : Extracted closing code better
133 * Alan Cox : Fixed the closing state machine to
134 * resemble the RFC.
135 * Alan Cox : More 'per spec' fixes.
136 * Jorge Cwik : Even faster checksumming.
137 * Alan Cox : tcp_data() doesn't ack illegal PSH
138 * only frames. At least one pc tcp stack
139 * generates them.
140 * Alan Cox : Cache last socket.
141 * Alan Cox : Per route irtt.
142 * Matt Day : Select() match BSD precisely on error
143 * Alan Cox : New buffers
144 * Mark Tamsky : Various sk->prot->retransmits and
145 * sk->retransmits misupdating fixed.
146 * Fixed tcp_write_timeout: stuck close,
147 * and TCP syn retries gets used now.
148 *
149 *
150 * To Fix:
151 * Fast path the code. Two things here - fix the window calculation
152 * so it doesn't iterate over the queue, also spot packets with no funny
153 * options arriving in order and process directly.
154 *
155 * Implement RFC 1191 [Path MTU discovery]
156 * Look at the effect of implementing RFC 1337 suggestions and their impact.
157 * Rewrite output state machine to use a single queue and do low window
158 * situations as per the spec (RFC 1122)
159 * Speed up input assembly algorithm.
160 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
161 * could do with it working on IPv4
162 * User settable/learned rtt/max window/mtu
163 * Cope with MTU/device switches when retransmitting in tcp.
164 * Fix the window handling to use PR's new code.
165 *
166 * Change the fundamental structure to a single send queue maintained
167 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on
168 * active routes too]). Cut the queue off in tcp_retransmit/
169 * tcp_transmit.
170 * Change the receive queue to assemble as it goes. This lets us
171 * dispose of most of tcp_sequence, half of tcp_ack and chunks of
172 * tcp_data/tcp_read as well as the window shrink crud.
173 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
174 * tcp_queue_skb seem obvious routines to extract.
175 *
176 * This program is free software; you can redistribute it and/or
177 * modify it under the terms of the GNU General Public License
178 * as published by the Free Software Foundation; either version
179 * 2 of the License, or(at your option) any later version.
180 *
181 * Description of States:
182 *
183 * TCP_SYN_SENT sent a connection request, waiting for ack
184 *
185 * TCP_SYN_RECV received a connection request, sent ack,
186 * waiting for final ack in three-way handshake.
187 *
188 * TCP_ESTABLISHED connection established
189 *
190 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
191 * transmission of remaining buffered data
192 *
193 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
194 * to shutdown
195 *
196 * TCP_CLOSING both sides have shutdown but we still have
197 * data we have to finish sending
198 *
199 * TCP_TIME_WAIT timeout to catch resent junk before entering
200 * closed, can only be entered from FIN_WAIT2
201 * or CLOSING. Required because the other end
202 * may not have gotten our last ACK causing it
203 * to retransmit the data packet (which we ignore)
204 *
205 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
206 * us to finish writing our data and to shutdown
207 * (we have to close() to move on to LAST_ACK)
208 *
209 * TCP_LAST_ACK out side has shutdown after remote has
210 * shutdown. There may still be data in our
211 * buffer that we have to finish sending
212 *
213 * TCP_CLOSE socket is finished
214 */
215
216 #include <linux/types.h>
217 #include <linux/sched.h>
218 #include <linux/mm.h>
219 #include <linux/time.h>
220 #include <linux/string.h>
221 #include <linux/config.h>
222 #include <linux/socket.h>
223 #include <linux/sockios.h>
224 #include <linux/termios.h>
225 #include <linux/in.h>
226 #include <linux/fcntl.h>
227 #include <linux/inet.h>
228 #include <linux/netdevice.h>
229 #include <net/snmp.h>
230 #include <net/ip.h>
231 #include <net/protocol.h>
232 #include <net/icmp.h>
233 #include <net/tcp.h>
234 #include <net/arp.h>
235 #include <linux/skbuff.h>
236 #include <net/sock.h>
237 #include <net/route.h>
238 #include <linux/errno.h>
239 #include <linux/timer.h>
240 #include <asm/system.h>
241 #include <asm/segment.h>
242 #include <linux/mm.h>
243 #include <net/checksum.h>
244
245 /*
246 * The MSL timer is the 'normal' timer.
247 */
248
249 #define reset_msl_timer(x,y,z) reset_timer(x,y,z)
250
251 #define SEQ_TICK 3
252 unsigned long seq_offset;
253 struct tcp_mib tcp_statistics;
254
255 /*
256 * Cached last hit socket
257 */
258
259 volatile unsigned long th_cache_saddr,th_cache_daddr;
260 volatile unsigned short th_cache_dport, th_cache_sport;
261 volatile struct sock *th_cache_sk;
262
263 void tcp_cache_zap(void)
/* ![[previous]](../icons/n_left.png)
![[next]](../icons/right.png)
![[first]](../icons/n_first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
264 {
265 unsigned long flags;
266 save_flags(flags);
267 cli();
268 th_cache_saddr=0;
269 th_cache_daddr=0;
270 th_cache_dport=0;
271 th_cache_sport=0;
272 th_cache_sk=NULL;
273 restore_flags(flags);
274 }
275
276 static void tcp_close(struct sock *sk, int timeout);
277
278
279 /*
280 * The less said about this the better, but it works and will do for 1.2
281 */
282
283 static struct wait_queue *master_select_wakeup;
284
285 static __inline__ int min(unsigned int a, unsigned int b)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
286 {
287 if (a < b)
288 return(a);
289 return(b);
290 }
291
292 #undef STATE_TRACE
293
294 #ifdef STATE_TRACE
295 static char *statename[]={
296 "Unused","Established","Syn Sent","Syn Recv",
297 "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
298 "Close Wait","Last ACK","Listen","Closing"
299 };
300 #endif
301
302 static __inline__ void tcp_set_state(struct sock *sk, int state)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
303 {
304 if(sk->state==TCP_ESTABLISHED)
305 tcp_statistics.TcpCurrEstab--;
306 #ifdef STATE_TRACE
307 if(sk->debug)
308 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
309 #endif
310 /* This is a hack but it doesn't occur often and it's going to
311 be a real to fix nicely */
312
313 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
314 {
315 wake_up_interruptible(&master_select_wakeup);
316 }
317 sk->state=state;
318 if(state==TCP_ESTABLISHED)
319 tcp_statistics.TcpCurrEstab++;
320 }
321
322 /*
323 * This routine picks a TCP windows for a socket based on
324 * the following constraints
325 *
326 * 1. The window can never be shrunk once it is offered (RFC 793)
327 * 2. We limit memory per socket
328 *
329 * For now we use NET2E3's heuristic of offering half the memory
330 * we have handy. All is not as bad as this seems however because
331 * of two things. Firstly we will bin packets even within the window
332 * in order to get the data we are waiting for into the memory limit.
333 * Secondly we bin common duplicate forms at receive time
334 * Better heuristics welcome
335 */
336
337 int tcp_select_window(struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
338 {
339 int new_window = sk->prot->rspace(sk);
340
341 if(sk->window_clamp)
342 new_window=min(sk->window_clamp,new_window);
343 /*
344 * Two things are going on here. First, we don't ever offer a
345 * window less than min(sk->mss, MAX_WINDOW/2). This is the
346 * receiver side of SWS as specified in RFC1122.
347 * Second, we always give them at least the window they
348 * had before, in order to avoid retracting window. This
349 * is technically allowed, but RFC1122 advises against it and
350 * in practice it causes trouble.
351 *
352 * Fixme: This doesn't correctly handle the case where
353 * new_window > sk->window but not by enough to allow for the
354 * shift in sequence space.
355 */
356 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
357 return(sk->window);
358 return(new_window);
359 }
360
361 /*
362 * Find someone to 'accept'. Must be called with
363 * sk->inuse=1 or cli()
364 */
365
366 static struct sk_buff *tcp_find_established(struct sock *s)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
367 {
368 struct sk_buff *p=skb_peek(&s->receive_queue);
369 if(p==NULL)
370 return NULL;
371 do
372 {
373 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
374 return p;
375 p=p->next;
376 }
377 while(p!=(struct sk_buff *)&s->receive_queue);
378 return NULL;
379 }
380
381 /*
382 * Remove a completed connection and return it. This is used by
383 * tcp_accept() to get connections from the queue.
384 */
385
386 static struct sk_buff *tcp_dequeue_established(struct sock *s)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
387 {
388 struct sk_buff *skb;
389 unsigned long flags;
390 save_flags(flags);
391 cli();
392 skb=tcp_find_established(s);
393 if(skb!=NULL)
394 skb_unlink(skb); /* Take it off the queue */
395 restore_flags(flags);
396 return skb;
397 }
398
399 /*
400 * This routine closes sockets which have been at least partially
401 * opened, but not yet accepted. Currently it is only called by
402 * tcp_close, and timeout mirrors the value there.
403 */
404
405 static void tcp_close_pending (struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
406 {
407 struct sk_buff *skb;
408
409 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
410 {
411 skb->sk->dead=1;
412 tcp_close(skb->sk, 0);
413 kfree_skb(skb, FREE_READ);
414 }
415 return;
416 }
417
418 /*
419 * Enter the time wait state.
420 */
421
422 static void tcp_time_wait(struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
423 {
424 tcp_set_state(sk,TCP_TIME_WAIT);
425 sk->shutdown = SHUTDOWN_MASK;
426 if (!sk->dead)
427 sk->state_change(sk);
428 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
429 }
430
431 /*
432 * A socket has timed out on its send queue and wants to do a
433 * little retransmitting. Currently this means TCP.
434 */
435
436 void tcp_do_retransmit(struct sock *sk, int all)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
437 {
438 struct sk_buff * skb;
439 struct proto *prot;
440 struct device *dev;
441 int ct=0;
442
443 prot = sk->prot;
444 skb = sk->send_head;
445
446 while (skb != NULL)
447 {
448 struct tcphdr *th;
449 struct iphdr *iph;
450 int size;
451
452 dev = skb->dev;
453 IS_SKB(skb);
454 skb->when = jiffies;
455
456 /*
457 * In general it's OK just to use the old packet. However we
458 * need to use the current ack and window fields. Urg and
459 * urg_ptr could possibly stand to be updated as well, but we
460 * don't keep the necessary data. That shouldn't be a problem,
461 * if the other end is doing the right thing. Since we're
462 * changing the packet, we have to issue a new IP identifier.
463 */
464
465 iph = (struct iphdr *)(skb->data + dev->hard_header_len);
466 th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
467 size = skb->len - (((unsigned char *) th) - skb->data);
468
469 /*
470 * Note: We ought to check for window limits here but
471 * currently this is done (less efficiently) elsewhere.
472 * We do need to check for a route change but can't handle
473 * that until we have the new 1.3.x buffers in.
474 *
475 */
476
477 iph->id = htons(ip_id_count++);
478 ip_send_check(iph);
479
480 /*
481 * This is not the right way to handle this. We have to
482 * issue an up to date window and ack report with this
483 * retransmit to keep the odd buggy tcp that relies on
484 * the fact BSD does this happy.
485 * We don't however need to recalculate the entire
486 * checksum, so someone wanting a small problem to play
487 * with might like to implement RFC1141/RFC1624 and speed
488 * this up by avoiding a full checksum.
489 */
490
491 th->ack_seq = ntohl(sk->acked_seq);
492 th->window = ntohs(tcp_select_window(sk));
493 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
494
495 /*
496 * If the interface is (still) up and running, kick it.
497 */
498
499 if (dev->flags & IFF_UP)
500 {
501 /*
502 * If the packet is still being sent by the device/protocol
503 * below then don't retransmit. This is both needed, and good -
504 * especially with connected mode AX.25 where it stops resends
505 * occurring of an as yet unsent anyway frame!
506 * We still add up the counts as the round trip time wants
507 * adjusting.
508 */
509 if (sk && !skb_device_locked(skb))
510 {
511 /* Remove it from any existing driver queue first! */
512 skb_unlink(skb);
513 /* Now queue it */
514 ip_statistics.IpOutRequests++;
515 dev_queue_xmit(skb, dev, sk->priority);
516 }
517 }
518
519 /*
520 * Count retransmissions
521 */
522
523 ct++;
524 sk->prot->retransmits ++;
525 tcp_statistics.TcpRetransSegs++;
526
527
528 /*
529 * Only one retransmit requested.
530 */
531
532 if (!all)
533 break;
534
535 /*
536 * This should cut it off before we send too many packets.
537 */
538
539 if (ct >= sk->cong_window)
540 break;
541 skb = skb->link3;
542 }
543 }
544
545 /*
546 * Reset the retransmission timer
547 */
548
549 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
550 {
551 del_timer(&sk->retransmit_timer);
552 sk->ip_xmit_timeout = why;
553 if((int)when < 0)
554 {
555 when=3;
556 printk("Error: Negative timer in xmit_timer\n");
557 }
558 sk->retransmit_timer.expires=when;
559 add_timer(&sk->retransmit_timer);
560 }
561
562 /*
563 * This is the normal code called for timeouts. It does the retransmission
564 * and then does backoff. tcp_do_retransmit is separated out because
565 * tcp_ack needs to send stuff from the retransmit queue without
566 * initiating a backoff.
567 */
568
569
570 void tcp_retransmit_time(struct sock *sk, int all)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
571 {
572 tcp_do_retransmit(sk, all);
573
574 /*
575 * Increase the timeout each time we retransmit. Note that
576 * we do not increase the rtt estimate. rto is initialized
577 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
578 * that doubling rto each time is the least we can get away with.
579 * In KA9Q, Karn uses this for the first few times, and then
580 * goes to quadratic. netBSD doubles, but only goes up to *64,
581 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
582 * defined in the protocol as the maximum possible RTT. I guess
583 * we'll have to use something other than TCP to talk to the
584 * University of Mars.
585 *
586 * PAWS allows us longer timeouts and large windows, so once
587 * implemented ftp to mars will work nicely. We will have to fix
588 * the 120 second clamps though!
589 */
590
591 sk->retransmits++;
592 sk->prot->retransmits++;
593 sk->backoff++;
594 sk->rto = min(sk->rto << 1, 120*HZ);
595 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
596 }
597
598
599 /*
600 * A timer event has trigger a tcp retransmit timeout. The
601 * socket xmit queue is ready and set up to send. Because
602 * the ack receive code keeps the queue straight we do
603 * nothing clever here.
604 */
605
606 static void tcp_retransmit(struct sock *sk, int all)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
607 {
608 if (all)
609 {
610 tcp_retransmit_time(sk, all);
611 return;
612 }
613
614 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
615 /* sk->ssthresh in theory can be zero. I guess that's OK */
616 sk->cong_count = 0;
617
618 sk->cong_window = 1;
619
620 /* Do the actual retransmit. */
621 tcp_retransmit_time(sk, all);
622 }
623
624 /*
625 * A write timeout has occurred. Process the after effects.
626 */
627
628 static int tcp_write_timeout(struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
629 {
630 /*
631 * Look for a 'soft' timeout.
632 */
633 if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
634 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
635 {
636 /*
637 * Attempt to recover if arp has changed (unlikely!) or
638 * a route has shifted (not supported prior to 1.3).
639 */
640 arp_destroy (sk->daddr, 0);
641 /*ip_route_check (sk->daddr);*/
642 }
643
644 /*
645 * Have we tried to SYN too many times (repent repent 8))
646 */
647
648 if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
649 {
650 sk->err=ETIMEDOUT;
651 sk->error_report(sk);
652 del_timer(&sk->retransmit_timer);
653 tcp_statistics.TcpAttemptFails++; /* Is this right ??? - FIXME - */
654 tcp_set_state(sk,TCP_CLOSE);
655 /* Don't FIN, we got nothing back */
656 release_sock(sk);
657 return 0;
658 }
659 /*
660 * Has it gone just too far ?
661 */
662 if (sk->retransmits > TCP_RETR2)
663 {
664 sk->err = ETIMEDOUT;
665 sk->error_report(sk);
666 del_timer(&sk->retransmit_timer);
667 /*
668 * Time wait the socket
669 */
670 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
671 {
672 tcp_set_state(sk,TCP_TIME_WAIT);
673 reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
674 }
675 else
676 {
677 /*
678 * Clean up time.
679 */
680 tcp_set_state(sk, TCP_CLOSE);
681 release_sock(sk);
682 return 0;
683 }
684 }
685 return 1;
686 }
687
688 /*
689 * The TCP retransmit timer. This lacks a few small details.
690 *
691 * 1. An initial rtt timeout on the probe0 should cause what we can
692 * of the first write queue buffer to be split and sent.
693 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report
694 * ETIMEDOUT if we know an additional 'soft' error caused this.
695 * tcp_err should save a 'soft error' for us.
696 */
697
698 static void retransmit_timer(unsigned long data)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
699 {
700 struct sock *sk = (struct sock*)data;
701 int why = sk->ip_xmit_timeout;
702
703 /*
704 * only process if socket is not in use
705 */
706
707 cli();
708 if (sk->inuse || in_bh)
709 {
710 /* Try again in 1 second */
711 sk->retransmit_timer.expires = HZ;
712 add_timer(&sk->retransmit_timer);
713 sti();
714 return;
715 }
716
717 sk->inuse = 1;
718 sti();
719
720 /* Always see if we need to send an ack. */
721
722 if (sk->ack_backlog && !sk->zapped)
723 {
724 sk->prot->read_wakeup (sk);
725 if (! sk->dead)
726 sk->data_ready(sk,0);
727 }
728
729 /* Now we need to figure out why the socket was on the timer. */
730
731 switch (why)
732 {
733 /* Window probing */
734 case TIME_PROBE0:
735 tcp_send_probe0(sk);
736 tcp_write_timeout(sk);
737 break;
738 /* Retransmitting */
739 case TIME_WRITE:
740 /* It could be we got here because we needed to send an ack.
741 * So we need to check for that.
742 */
743 {
744 struct sk_buff *skb;
745 unsigned long flags;
746
747 save_flags(flags);
748 cli();
749 skb = sk->send_head;
750 if (!skb)
751 {
752 restore_flags(flags);
753 }
754 else
755 {
756 /*
757 * Kicked by a delayed ack. Reset timer
758 * correctly now
759 */
760 if (jiffies < skb->when + sk->rto)
761 {
762 reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
763 restore_flags(flags);
764 break;
765 }
766 restore_flags(flags);
767 /*
768 * Retransmission
769 */
770 sk->prot->retransmit (sk, 0);
771 tcp_write_timeout(sk);
772 }
773 break;
774 }
775 /* Sending Keepalives */
776 case TIME_KEEPOPEN:
777 /*
778 * this reset_timer() call is a hack, this is not
779 * how KEEPOPEN is supposed to work.
780 */
781 reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
782
783 /* Send something to keep the connection open. */
784 if (sk->prot->write_wakeup)
785 sk->prot->write_wakeup (sk);
786 sk->retransmits++;
787 sk->prot->retransmits++;
788 tcp_write_timeout(sk);
789 break;
790 default:
791 printk ("rexmit_timer: timer expired - reason unknown\n");
792 break;
793 }
794 release_sock(sk);
795 }
796
797 /*
798 * This routine is called by the ICMP module when it gets some
799 * sort of error condition. If err < 0 then the socket should
800 * be closed and the error returned to the user. If err > 0
801 * it's just the icmp type << 8 | icmp code. After adjustment
802 * header points to the first 8 bytes of the tcp header. We need
803 * to find the appropriate port.
804 */
805
806 void tcp_err(int err, unsigned char *header, unsigned long daddr,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
807 unsigned long saddr, struct inet_protocol *protocol)
808 {
809 struct tcphdr *th;
810 struct sock *sk;
811 struct iphdr *iph=(struct iphdr *)header;
812
813 header+=4*iph->ihl;
814
815
816 th =(struct tcphdr *)header;
817 sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
818
819 if (sk == NULL)
820 return;
821
822 if(err<0)
823 {
824 sk->err = -err;
825 sk->error_report(sk);
826 return;
827 }
828
829 if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8))
830 {
831 /*
832 * FIXME:
833 * For now we will just trigger a linear backoff.
834 * The slow start code should cause a real backoff here.
835 */
836 if (sk->cong_window > 4)
837 sk->cong_window--;
838 return;
839 }
840
841 /* sk->err = icmp_err_convert[err & 0xff].errno; -- moved as TCP should hide non fatals internally (and does) */
842
843 /*
844 * If we've already connected we will keep trying
845 * until we time out, or the user gives up.
846 */
847
848 if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT)
849 {
850 if (sk->state == TCP_SYN_SENT)
851 {
852 tcp_statistics.TcpAttemptFails++;
853 tcp_set_state(sk,TCP_CLOSE);
854 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
855 }
856 sk->err = icmp_err_convert[err & 0xff].errno;
857 }
858 return;
859 }
860
861
862 /*
863 * Walk down the receive queue counting readable data until we hit the end or we find a gap
864 * in the received data queue (ie a frame missing that needs sending to us). Not
865 * sorting using two queues as data arrives makes life so much harder.
866 */
867
868 static int tcp_readable(struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
869 {
870 unsigned long counted;
871 unsigned long amount;
872 struct sk_buff *skb;
873 int sum;
874 unsigned long flags;
875
876 if(sk && sk->debug)
877 printk("tcp_readable: %p - ",sk);
878
879 save_flags(flags);
880 cli();
881 if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
882 {
883 restore_flags(flags);
884 if(sk && sk->debug)
885 printk("empty\n");
886 return(0);
887 }
888
889 counted = sk->copied_seq; /* Where we are at the moment */
890 amount = 0;
891
892 /*
893 * Do until a push or until we are out of data.
894 */
895
896 do
897 {
898 if (before(counted, skb->h.th->seq)) /* Found a hole so stops here */
899 break;
900 sum = skb->len -(counted - skb->h.th->seq); /* Length - header but start from where we are up to (avoid overlaps) */
901 if (skb->h.th->syn)
902 sum++;
903 if (sum > 0)
904 { /* Add it up, move on */
905 amount += sum;
906 if (skb->h.th->syn)
907 amount--;
908 counted += sum;
909 }
910 /*
911 * Don't count urg data ... but do it in the right place!
912 * Consider: "old_data (ptr is here) URG PUSH data"
913 * The old code would stop at the first push because
914 * it counted the urg (amount==1) and then does amount--
915 * *after* the loop. This means tcp_readable() always
916 * returned zero if any URG PUSH was in the queue, even
917 * though there was normal data available. If we subtract
918 * the urg data right here, we even get it to work for more
919 * than one URG PUSH skb without normal data.
920 * This means that select() finally works now with urg data
921 * in the queue. Note that rlogin was never affected
922 * because it doesn't use select(); it uses two processes
923 * and a blocking read(). And the queue scan in tcp_read()
924 * was correct. Mike <pall@rz.uni-karlsruhe.de>
925 */
926 if (skb->h.th->urg)
927 amount--; /* don't count urg data */
928 if (amount && skb->h.th->psh) break;
929 skb = skb->next;
930 }
931 while(skb != (struct sk_buff *)&sk->receive_queue);
932
933 restore_flags(flags);
934 if(sk->debug)
935 printk("got %lu bytes.\n",amount);
936 return(amount);
937 }
938
939 /*
940 * LISTEN is a special case for select..
941 */
942 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
943 {
944 if (sel_type == SEL_IN) {
945 int retval;
946
947 sk->inuse = 1;
948 retval = (tcp_find_established(sk) != NULL);
949 release_sock(sk);
950 if (!retval)
951 select_wait(&master_select_wakeup,wait);
952 return retval;
953 }
954 return 0;
955 }
956
957
958 /*
959 * Wait for a TCP event.
960 *
961 * Note that we don't need to set "sk->inuse", as the upper select layers
962 * take care of normal races (between the test and the event) and we don't
963 * go look at any of the socket buffers directly.
964 */
965 static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
966 {
967 if (sk->state == TCP_LISTEN)
968 return tcp_listen_select(sk, sel_type, wait);
969
970 switch(sel_type) {
971 case SEL_IN:
972 if (sk->err)
973 return 1;
974 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
975 break;
976
977 if (sk->shutdown & RCV_SHUTDOWN)
978 return 1;
979
980 if (sk->acked_seq == sk->copied_seq)
981 break;
982
983 if (sk->urg_seq != sk->copied_seq ||
984 sk->acked_seq != sk->copied_seq+1 ||
985 sk->urginline || !sk->urg_data)
986 return 1;
987 break;
988
989 case SEL_OUT:
990 if (sk->err)
991 return 1;
992 if (sk->shutdown & SEND_SHUTDOWN)
993 return 0;
994 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
995 break;
996 /*
997 * This is now right thanks to a small fix
998 * by Matt Dillon.
999 */
1000
1001 if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
1002 break;
1003 return 1;
1004
1005 case SEL_EX:
1006 if (sk->urg_data)
1007 return 1;
1008 break;
1009 }
1010 select_wait(sk->sleep, wait);
1011 return 0;
1012 }
1013
1014 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1015 {
1016 int err;
1017 switch(cmd)
1018 {
1019
1020 case TIOCINQ:
1021 #ifdef FIXME /* FIXME: */
1022 case FIONREAD:
1023 #endif
1024 {
1025 unsigned long amount;
1026
1027 if (sk->state == TCP_LISTEN)
1028 return(-EINVAL);
1029
1030 sk->inuse = 1;
1031 amount = tcp_readable(sk);
1032 release_sock(sk);
1033 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1034 if(err)
1035 return err;
1036 put_user(amount, (int *)arg);
1037 return(0);
1038 }
1039 case SIOCATMARK:
1040 {
1041 int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
1042
1043 err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1044 if (err)
1045 return err;
1046 put_user(answ,(int *) arg);
1047 return(0);
1048 }
1049 case TIOCOUTQ:
1050 {
1051 unsigned long amount;
1052
1053 if (sk->state == TCP_LISTEN) return(-EINVAL);
1054 amount = sk->prot->wspace(sk);
1055 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1056 if(err)
1057 return err;
1058 put_user(amount, (int *)arg);
1059 return(0);
1060 }
1061 default:
1062 return(-EINVAL);
1063 }
1064 }
1065
1066
1067 /*
1068 * This routine computes a TCP checksum.
1069 *
1070 * Modified January 1995 from a go-faster DOS routine by
1071 * Jorge Cwik <jorge@laser.satlink.net>
1072 */
1073
1074 unsigned short tcp_check(struct tcphdr *th, int len,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1075 unsigned long saddr, unsigned long daddr, unsigned long base)
1076 {
1077 return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
1078 }
1079
1080
1081
1082 void tcp_send_check(struct tcphdr *th, unsigned long saddr,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1083 unsigned long daddr, int len, struct sock *sk)
1084 {
1085 th->check = 0;
1086 th->check = tcp_check(th, len, saddr, daddr,
1087 csum_partial((char *)th,len,0));
1088 return;
1089 }
1090
1091 /*
1092 * This is the main buffer sending routine. We queue the buffer
1093 * having checked it is sane seeming.
1094 */
1095
1096 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1097 {
1098 int size;
1099 struct tcphdr * th = skb->h.th;
1100
1101 /*
1102 * length of packet (not counting length of pre-tcp headers)
1103 */
1104
1105 size = skb->len - ((unsigned char *) th - skb->data);
1106
1107 /*
1108 * Sanity check it..
1109 */
1110
1111 if (size < sizeof(struct tcphdr) || size > skb->len)
1112 {
1113 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1114 skb, skb->data, th, skb->len);
1115 kfree_skb(skb, FREE_WRITE);
1116 return;
1117 }
1118
1119 /*
1120 * If we have queued a header size packet.. (these crash a few
1121 * tcp stacks if ack is not set)
1122 */
1123
1124 if (size == sizeof(struct tcphdr))
1125 {
1126 /* If it's got a syn or fin it's notionally included in the size..*/
1127 if(!th->syn && !th->fin)
1128 {
1129 printk("tcp_send_skb: attempt to queue a bogon.\n");
1130 kfree_skb(skb,FREE_WRITE);
1131 return;
1132 }
1133 }
1134
1135 /*
1136 * Actual processing.
1137 */
1138
1139 tcp_statistics.TcpOutSegs++;
1140 skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1141
1142 /*
1143 * We must queue if
1144 *
1145 * a) The right edge of this frame exceeds the window
1146 * b) We are retransmitting (Nagle's rule)
1147 * c) We have too many packets 'in flight'
1148 */
1149
1150 if (after(skb->h.seq, sk->window_seq) ||
1151 (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1152 sk->packets_out >= sk->cong_window)
1153 {
1154 /* checksum will be supplied by tcp_write_xmit. So
1155 * we shouldn't need to set it at all. I'm being paranoid */
1156 th->check = 0;
1157 if (skb->next != NULL)
1158 {
1159 printk("tcp_send_partial: next != NULL\n");
1160 skb_unlink(skb);
1161 }
1162 skb_queue_tail(&sk->write_queue, skb);
1163
1164 /*
1165 * If we don't fit we have to start the zero window
1166 * probes. This is broken - we really need to do a partial
1167 * send _first_ (This is what causes the Cisco and PC/TCP
1168 * grief).
1169 */
1170
1171 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1172 sk->send_head == NULL && sk->ack_backlog == 0)
1173 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1174 }
1175 else
1176 {
1177 /*
1178 * This is going straight out
1179 */
1180
1181 th->ack_seq = ntohl(sk->acked_seq);
1182 th->window = ntohs(tcp_select_window(sk));
1183
1184 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1185
1186 sk->sent_seq = sk->write_seq;
1187
1188 /*
1189 * This is mad. The tcp retransmit queue is put together
1190 * by the ip layer. This causes half the problems with
1191 * unroutable FIN's and other things.
1192 */
1193
1194 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1195
1196 /*
1197 * Set for next retransmit based on expected ACK time.
1198 * FIXME: We set this every time which means our
1199 * retransmits are really about a window behind.
1200 */
1201
1202 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1203 }
1204 }
1205
1206 /*
1207 * Locking problems lead us to a messy situation where we can have
1208 * multiple partially complete buffers queued up. This is really bad
1209 * as we don't want to be sending partial buffers. Fix this with
1210 * a semaphore or similar to lock tcp_write per socket.
1211 *
1212 * These routines are pretty self descriptive.
1213 */
1214
1215 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1216 {
1217 struct sk_buff * skb;
1218 unsigned long flags;
1219
1220 save_flags(flags);
1221 cli();
1222 skb = sk->partial;
1223 if (skb) {
1224 sk->partial = NULL;
1225 del_timer(&sk->partial_timer);
1226 }
1227 restore_flags(flags);
1228 return skb;
1229 }
1230
1231 /*
1232 * Empty the partial queue
1233 */
1234
1235 static void tcp_send_partial(struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1236 {
1237 struct sk_buff *skb;
1238
1239 if (sk == NULL)
1240 return;
1241 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1242 tcp_send_skb(sk, skb);
1243 }
1244
1245 /*
1246 * Queue a partial frame
1247 */
1248
1249 void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1250 {
1251 struct sk_buff * tmp;
1252 unsigned long flags;
1253
1254 save_flags(flags);
1255 cli();
1256 tmp = sk->partial;
1257 if (tmp)
1258 del_timer(&sk->partial_timer);
1259 sk->partial = skb;
1260 init_timer(&sk->partial_timer);
1261 /*
1262 * Wait up to 1 second for the buffer to fill.
1263 */
1264 sk->partial_timer.expires = HZ;
1265 sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
1266 sk->partial_timer.data = (unsigned long) sk;
1267 add_timer(&sk->partial_timer);
1268 restore_flags(flags);
1269 if (tmp)
1270 tcp_send_skb(sk, tmp);
1271 }
1272
1273
1274 /*
1275 * This routine sends an ack and also updates the window.
1276 */
1277
1278 static void tcp_send_ack(u32 sequence, u32 ack,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1279 struct sock *sk,
1280 struct tcphdr *th, unsigned long daddr)
1281 {
1282 struct sk_buff *buff;
1283 struct tcphdr *t1;
1284 struct device *dev = NULL;
1285 int tmp;
1286
1287 if(sk->zapped)
1288 return; /* We have been reset, we may not send again */
1289
1290 /*
1291 * We need to grab some memory, and put together an ack,
1292 * and then put it into the queue to be sent.
1293 */
1294
1295 buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1296 if (buff == NULL)
1297 {
1298 /*
1299 * Force it to send an ack. We don't have to do this
1300 * (ACK is unreliable) but it's much better use of
1301 * bandwidth on slow links to send a spare ack than
1302 * resend packets.
1303 */
1304
1305 sk->ack_backlog++;
1306 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
1307 {
1308 reset_xmit_timer(sk, TIME_WRITE, HZ);
1309 }
1310 return;
1311 }
1312
1313 /*
1314 * Assemble a suitable TCP frame
1315 */
1316
1317 buff->sk = sk;
1318 buff->localroute = sk->localroute;
1319
1320 /*
1321 * Put in the IP header and routing stuff.
1322 */
1323
1324 tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1325 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1326 if (tmp < 0)
1327 {
1328 buff->free = 1;
1329 sk->prot->wfree(sk, buff);
1330 return;
1331 }
1332 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
1333
1334 memcpy(t1, th, sizeof(*t1));
1335
1336 /*
1337 * Swap the send and the receive.
1338 */
1339
1340 t1->dest = th->source;
1341 t1->source = th->dest;
1342 t1->seq = ntohl(sequence);
1343 t1->ack = 1;
1344 sk->window = tcp_select_window(sk);
1345 t1->window = ntohs(sk->window);
1346 t1->res1 = 0;
1347 t1->res2 = 0;
1348 t1->rst = 0;
1349 t1->urg = 0;
1350 t1->syn = 0;
1351 t1->psh = 0;
1352 t1->fin = 0;
1353
1354 /*
1355 * If we have nothing queued for transmit and the transmit timer
1356 * is on we are just doing an ACK timeout and need to switch
1357 * to a keepalive.
1358 */
1359
1360 if (ack == sk->acked_seq)
1361 {
1362 sk->ack_backlog = 0;
1363 sk->bytes_rcv = 0;
1364 sk->ack_timed = 0;
1365 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
1366 && sk->ip_xmit_timeout == TIME_WRITE)
1367 {
1368 if(sk->keepopen) {
1369 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1370 } else {
1371 delete_timer(sk);
1372 }
1373 }
1374 }
1375
1376 /*
1377 * Fill in the packet and send it
1378 */
1379
1380 t1->ack_seq = ntohl(ack);
1381 t1->doff = sizeof(*t1)/4;
1382 tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1383 if (sk->debug)
1384 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
1385 tcp_statistics.TcpOutSegs++;
1386 sk->prot->queue_xmit(sk, dev, buff, 1);
1387 }
1388
1389
1390 /*
1391 * This routine builds a generic TCP header.
1392 */
1393
1394 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1395 {
1396
1397 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1398 th->seq = htonl(sk->write_seq);
1399 th->psh =(push == 0) ? 1 : 0;
1400 th->doff = sizeof(*th)/4;
1401 th->ack = 1;
1402 th->fin = 0;
1403 sk->ack_backlog = 0;
1404 sk->bytes_rcv = 0;
1405 sk->ack_timed = 0;
1406 th->ack_seq = htonl(sk->acked_seq);
1407 sk->window = tcp_select_window(sk);
1408 th->window = htons(sk->window);
1409
1410 return(sizeof(*th));
1411 }
1412
1413 /*
1414 * This routine copies from a user buffer into a socket,
1415 * and starts the transmit system.
1416 */
1417
1418 static int tcp_write(struct sock *sk, unsigned char *from,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1419 int len, int nonblock, unsigned flags)
1420 {
1421 int copied = 0;
1422 int copy;
1423 int tmp;
1424 struct sk_buff *skb;
1425 struct sk_buff *send_tmp;
1426 struct proto *prot;
1427 struct device *dev = NULL;
1428
1429 sk->inuse=1;
1430 prot = sk->prot;
1431 while(len > 0)
1432 {
1433 if (sk->err)
1434 { /* Stop on an error */
1435 release_sock(sk);
1436 if (copied)
1437 return(copied);
1438 tmp = -sk->err;
1439 sk->err = 0;
1440 return(tmp);
1441 }
1442
1443 /*
1444 * First thing we do is make sure that we are established.
1445 */
1446
1447 if (sk->shutdown & SEND_SHUTDOWN)
1448 {
1449 release_sock(sk);
1450 sk->err = EPIPE;
1451 if (copied)
1452 return(copied);
1453 sk->err = 0;
1454 return(-EPIPE);
1455 }
1456
1457 /*
1458 * Wait for a connection to finish.
1459 */
1460
1461 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
1462 {
1463 if (sk->err)
1464 {
1465 release_sock(sk);
1466 if (copied)
1467 return(copied);
1468 tmp = -sk->err;
1469 sk->err = 0;
1470 return(tmp);
1471 }
1472
1473 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
1474 {
1475 release_sock(sk);
1476 if (copied)
1477 return(copied);
1478
1479 if (sk->err)
1480 {
1481 tmp = -sk->err;
1482 sk->err = 0;
1483 return(tmp);
1484 }
1485
1486 if (sk->keepopen)
1487 {
1488 send_sig(SIGPIPE, current, 0);
1489 }
1490 return(-EPIPE);
1491 }
1492
1493 if (nonblock || copied)
1494 {
1495 release_sock(sk);
1496 if (copied)
1497 return(copied);
1498 return(-EAGAIN);
1499 }
1500
1501 release_sock(sk);
1502 cli();
1503
1504 if (sk->state != TCP_ESTABLISHED &&
1505 sk->state != TCP_CLOSE_WAIT && sk->err == 0)
1506 {
1507 interruptible_sleep_on(sk->sleep);
1508 if (current->signal & ~current->blocked)
1509 {
1510 sti();
1511 if (copied)
1512 return(copied);
1513 return(-ERESTARTSYS);
1514 }
1515 }
1516 sk->inuse = 1;
1517 sti();
1518 }
1519
1520 /*
1521 * The following code can result in copy <= if sk->mss is ever
1522 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
1523 * sk->mtu is constant once SYN processing is finished. I.e. we
1524 * had better not get here until we've seen his SYN and at least one
1525 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
1526 * But ESTABLISHED should guarantee that. sk->max_window is by definition
1527 * non-decreasing. Note that any ioctl to set user_mss must be done
1528 * before the exchange of SYN's. If the initial ack from the other
1529 * end has a window of 0, max_window and thus mss will both be 0.
1530 */
1531
1532 /*
1533 * Now we need to check if we have a half built packet.
1534 */
1535
1536 if ((skb = tcp_dequeue_partial(sk)) != NULL)
1537 {
1538 int hdrlen;
1539
1540 /* IP header + TCP header */
1541 hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
1542 + sizeof(struct tcphdr);
1543
1544 /* Add more stuff to the end of skb->len */
1545 if (!(flags & MSG_OOB))
1546 {
1547 copy = min(sk->mss - (skb->len - hdrlen), len);
1548 /* FIXME: this is really a bug. */
1549 if (copy <= 0)
1550 {
1551 printk("TCP: **bug**: \"copy\" <= 0!!\n");
1552 copy = 0;
1553 }
1554
1555 memcpy_fromfs(skb_put(skb,copy), from, copy);
1556 from += copy;
1557 copied += copy;
1558 len -= copy;
1559 sk->write_seq += copy;
1560 }
1561 if ((skb->len - hdrlen) >= sk->mss ||
1562 (flags & MSG_OOB) || !sk->packets_out)
1563 tcp_send_skb(sk, skb);
1564 else
1565 tcp_enqueue_partial(skb, sk);
1566 continue;
1567 }
1568
1569 /*
1570 * We also need to worry about the window.
1571 * If window < 1/2 the maximum window we've seen from this
1572 * host, don't use it. This is sender side
1573 * silly window prevention, as specified in RFC1122.
1574 * (Note that this is different than earlier versions of
1575 * SWS prevention, e.g. RFC813.). What we actually do is
1576 * use the whole MSS. Since the results in the right
1577 * edge of the packet being outside the window, it will
1578 * be queued for later rather than sent.
1579 */
1580
1581 copy = sk->window_seq - sk->write_seq;
1582 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1583 copy = sk->mss;
1584 if (copy > len)
1585 copy = len;
1586
1587 /*
1588 * We should really check the window here also.
1589 */
1590
1591 send_tmp = NULL;
1592 if (copy < sk->mss && !(flags & MSG_OOB))
1593 {
1594 /*
1595 * We will release the socket in case we sleep here.
1596 */
1597 release_sock(sk);
1598 /*
1599 * NB: following must be mtu, because mss can be increased.
1600 * mss is always <= mtu
1601 */
1602 skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
1603 sk->inuse = 1;
1604 send_tmp = skb;
1605 }
1606 else
1607 {
1608 /*
1609 * We will release the socket in case we sleep here.
1610 */
1611 release_sock(sk);
1612 skb = prot->wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
1613 sk->inuse = 1;
1614 }
1615
1616 /*
1617 * If we didn't get any memory, we need to sleep.
1618 */
1619
1620 if (skb == NULL)
1621 {
1622 sk->socket->flags |= SO_NOSPACE;
1623 if (nonblock)
1624 {
1625 release_sock(sk);
1626 if (copied)
1627 return(copied);
1628 return(-EAGAIN);
1629 }
1630
1631 /*
1632 * FIXME: here is another race condition.
1633 */
1634
1635 tmp = sk->wmem_alloc;
1636 release_sock(sk);
1637 cli();
1638 /*
1639 * Again we will try to avoid it.
1640 */
1641 if (tmp <= sk->wmem_alloc &&
1642 (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1643 && sk->err == 0)
1644 {
1645 sk->socket->flags &= ~SO_NOSPACE;
1646 interruptible_sleep_on(sk->sleep);
1647 if (current->signal & ~current->blocked)
1648 {
1649 sti();
1650 if (copied)
1651 return(copied);
1652 return(-ERESTARTSYS);
1653 }
1654 }
1655 sk->inuse = 1;
1656 sti();
1657 continue;
1658 }
1659
1660 skb->sk = sk;
1661 skb->free = 0;
1662 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1663
1664 /*
1665 * FIXME: we need to optimize this.
1666 * Perhaps some hints here would be good.
1667 */
1668
1669 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1670 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl);
1671 if (tmp < 0 )
1672 {
1673 prot->wfree(sk, skb);
1674 release_sock(sk);
1675 if (copied)
1676 return(copied);
1677 return(tmp);
1678 }
1679 skb->dev = dev;
1680 skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
1681 tmp = tcp_build_header(skb->h.th, sk, len-copy);
1682 if (tmp < 0)
1683 {
1684 prot->wfree(sk, skb);
1685 release_sock(sk);
1686 if (copied)
1687 return(copied);
1688 return(tmp);
1689 }
1690
1691 if (flags & MSG_OOB)
1692 {
1693 skb->h.th->urg = 1;
1694 skb->h.th->urg_ptr = ntohs(copy);
1695 }
1696
1697 memcpy_fromfs(skb_put(skb,copy), from, copy);
1698
1699 from += copy;
1700 copied += copy;
1701 len -= copy;
1702 skb->free = 0;
1703 sk->write_seq += copy;
1704
1705 if (send_tmp != NULL && sk->packets_out)
1706 {
1707 tcp_enqueue_partial(send_tmp, sk);
1708 continue;
1709 }
1710 tcp_send_skb(sk, skb);
1711 }
1712 sk->err = 0;
1713
1714 /*
1715 * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
1716 * interactive fast network servers. It's meant to be on and
1717 * it really improves the throughput though not the echo time
1718 * on my slow slip link - Alan
1719 */
1720
1721 /*
1722 * Avoid possible race on send_tmp - c/o Johannes Stille
1723 */
1724
1725 if(sk->partial && ((!sk->packets_out)
1726 /* If not nagling we can send on the before case too.. */
1727 || (sk->nonagle && before(sk->write_seq , sk->window_seq))
1728 ))
1729 tcp_send_partial(sk);
1730
1731 release_sock(sk);
1732 return(copied);
1733 }
1734
1735 /*
1736 * This is just a wrapper.
1737 */
1738
1739 static int tcp_sendto(struct sock *sk, unsigned char *from,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1740 int len, int nonblock, unsigned flags,
1741 struct sockaddr_in *addr, int addr_len)
1742 {
1743 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1744 return -EINVAL;
1745 if (sk->state == TCP_CLOSE)
1746 return -ENOTCONN;
1747 if (addr_len < sizeof(*addr))
1748 return -EINVAL;
1749 if (addr->sin_family && addr->sin_family != AF_INET)
1750 return -EINVAL;
1751 if (addr->sin_port != sk->dummy_th.dest)
1752 return -EISCONN;
1753 if (addr->sin_addr.s_addr != sk->daddr)
1754 return -EISCONN;
1755 return tcp_write(sk, from, len, nonblock, flags);
1756 }
1757
1758
1759 /*
1760 * Send an ack if one is backlogged at this point. Ought to merge
1761 * this with tcp_send_ack().
1762 */
1763
1764 static void tcp_read_wakeup(struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1765 {
1766 int tmp;
1767 struct device *dev = NULL;
1768 struct tcphdr *t1;
1769 struct sk_buff *buff;
1770
1771 if (!sk->ack_backlog)
1772 return;
1773
1774 /*
1775 * FIXME: we need to put code here to prevent this routine from
1776 * being called. Being called once in a while is ok, so only check
1777 * if this is the second time in a row.
1778 */
1779
1780 /*
1781 * We need to grab some memory, and put together an ack,
1782 * and then put it into the queue to be sent.
1783 */
1784
1785 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
1786 if (buff == NULL)
1787 {
1788 /* Try again real soon. */
1789 reset_xmit_timer(sk, TIME_WRITE, HZ);
1790 return;
1791 }
1792
1793 buff->sk = sk;
1794 buff->localroute = sk->localroute;
1795
1796 /*
1797 * Put in the IP header and routing stuff.
1798 */
1799
1800 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
1801 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1802 if (tmp < 0)
1803 {
1804 buff->free = 1;
1805 sk->prot->wfree(sk, buff);
1806 return;
1807 }
1808
1809 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
1810
1811 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
1812 t1->seq = htonl(sk->sent_seq);
1813 t1->ack = 1;
1814 t1->res1 = 0;
1815 t1->res2 = 0;
1816 t1->rst = 0;
1817 t1->urg = 0;
1818 t1->syn = 0;
1819 t1->psh = 0;
1820 sk->ack_backlog = 0;
1821 sk->bytes_rcv = 0;
1822 sk->window = tcp_select_window(sk);
1823 t1->window = ntohs(sk->window);
1824 t1->ack_seq = ntohl(sk->acked_seq);
1825 t1->doff = sizeof(*t1)/4;
1826 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
1827 sk->prot->queue_xmit(sk, dev, buff, 1);
1828 tcp_statistics.TcpOutSegs++;
1829 }
1830
1831
1832 /*
1833 * FIXME:
1834 * This routine frees used buffers.
1835 * It should consider sending an ACK to let the
1836 * other end know we now have a bigger window.
1837 */
1838
1839 static void cleanup_rbuf(struct sock *sk)
1840 {
1841 unsigned long flags;
1842 unsigned long left;
1843 struct sk_buff *skb;
1844 unsigned long rspace;
1845
1846 if(sk->debug)
1847 printk("cleaning rbuf for sk=%p\n", sk);
1848
1849 save_flags(flags);
1850 cli();
1851
1852 left = sk->prot->rspace(sk);
1853
1854 /*
1855 * We have to loop through all the buffer headers,
1856 * and try to free up all the space we can.
1857 */
1858
1859 while((skb=skb_peek(&sk->receive_queue)) != NULL)
1860 {
1861 if (!skb->used || skb->users)
1862 break;
1863 skb_unlink(skb);
1864 skb->sk = sk;
1865 kfree_skb(skb, FREE_READ);
1866 }
1867
1868 restore_flags(flags);
1869
1870 /*
1871 * FIXME:
1872 * At this point we should send an ack if the difference
1873 * in the window, and the amount of space is bigger than
1874 * TCP_WINDOW_DIFF.
1875 */
1876
1877 if(sk->debug)
1878 printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
1879 left);
1880 if ((rspace=sk->prot->rspace(sk)) != left)
1881 {
1882 /*
1883 * This area has caused the most trouble. The current strategy
1884 * is to simply do nothing if the other end has room to send at
1885 * least 3 full packets, because the ack from those will auto-
1886 * matically update the window. If the other end doesn't think
1887 * we have much space left, but we have room for at least 1 more
1888 * complete packet than it thinks we do, we will send an ack
1889 * immediately. Otherwise we will wait up to .5 seconds in case
1890 * the user reads some more.
1891 */
1892 sk->ack_backlog++;
1893 /*
1894 * It's unclear whether to use sk->mtu or sk->mss here. They differ only
1895 * if the other end is offering a window smaller than the agreed on MSS
1896 * (called sk->mtu here). In theory there's no connection between send
1897 * and receive, and so no reason to think that they're going to send
1898 * small packets. For the moment I'm using the hack of reducing the mss
1899 * only on the send side, so I'm putting mtu here.
1900 */
1901
1902 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
1903 {
1904 /* Send an ack right now. */
1905 tcp_read_wakeup(sk);
1906 }
1907 else
1908 {
1909 /* Force it to send an ack soon. */
1910 int was_active = del_timer(&sk->retransmit_timer);
1911 if (!was_active || TCP_ACK_TIME < sk->timer.expires)
1912 {
1913 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
1914 }
1915 else
1916 add_timer(&sk->retransmit_timer);
1917 }
1918 }
1919 }
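/*
 * A worked example of the ack decision above (numbers illustrative
 * only): with sk->window = 8192, sk->bytes_rcv = 4096 and sk->mtu =
 * 1460 the threshold is 8192 - 4096 + 1460 = 5556 bytes. A read
 * freeing rspace = 6000 bytes exceeds it, so tcp_read_wakeup() acks
 * at once; one freeing 5000 merely schedules an ack within
 * TCP_ACK_TIME.
 */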
1920
1921
1922 /*
1923 * Handle reading urgent data. BSD has very simple semantics for
1924 * this, no blocking and very strange errors 8)
1925 */
1926
1927 static int tcp_read_urg(struct sock * sk, int nonblock,
1928 unsigned char *to, int len, unsigned flags)
1929 {
1930 /*
1931 * No URG data to read
1932 */
1933 if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
1934 return -EINVAL; /* Yes this is right ! */
1935
1936 if (sk->err)
1937 {
1938 int tmp = -sk->err;
1939 sk->err = 0;
1940 return tmp;
1941 }
1942
1943 if (sk->state == TCP_CLOSE || sk->done)
1944 {
1945 if (!sk->done) {
1946 sk->done = 1;
1947 return 0;
1948 }
1949 return -ENOTCONN;
1950 }
1951
1952 if (sk->shutdown & RCV_SHUTDOWN)
1953 {
1954 sk->done = 1;
1955 return 0;
1956 }
1957 sk->inuse = 1;
1958 if (sk->urg_data & URG_VALID)
1959 {
1960 char c = sk->urg_data;
1961 if (!(flags & MSG_PEEK))
1962 sk->urg_data = URG_READ;
1963 put_fs_byte(c, to);
1964 release_sock(sk);
1965 return 1;
1966 }
1967 release_sock(sk);
1968
1969 /*
1970 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1971 * the available implementations agree in this case:
1972 * this call should never block, independent of the
1973 * blocking state of the socket.
1974 * Mike <pall@rz.uni-karlsruhe.de>
1975 */
1976 return -EAGAIN;
1977 }
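/*
 * A minimal user space sketch of the semantics implemented above
 * (illustrative, not part of this file): recv() with MSG_OOB never
 * blocks, so callers must expect -1 with errno EAGAIN when no urgent
 * byte is pending, and -1 with EINVAL once the byte has been consumed
 * or SO_OOBINLINE is set.
 */
#include <errno.h>
#include <sys/socket.h>

int read_oob_byte(int fd, char *c)
{
	int n = recv(fd, c, 1, MSG_OOB);	/* never blocks */
	if (n < 0 && errno == EAGAIN)
		return 0;			/* no urgent data yet */
	return n;				/* 1 byte, 0 at EOF, -1 on error */
}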
1978
1979
1980 /*
1981 * This routine copies from a sock struct into the user buffer.
1982 */
1983
1984 static int tcp_read(struct sock *sk, unsigned char *to,
1985 int len, int nonblock, unsigned flags)
1986 {
1987 struct wait_queue wait = { current, NULL };
1988 int copied = 0;
1989 u32 peek_seq;
1990 volatile u32 *seq; /* So gcc doesn't overoptimise */
1991 unsigned long used;
1992
1993 /*
1994 * This error should be checked.
1995 */
1996
1997 if (sk->state == TCP_LISTEN)
1998 return -ENOTCONN;
1999
2000 /*
2001 * Urgent data needs to be handled specially.
2002 */
2003
2004 if (flags & MSG_OOB)
2005 return tcp_read_urg(sk, nonblock, to, len, flags);
2006
2007 /*
2008 * Copying sequence to update. This is volatile to handle
2009 * the multi-reader case neatly (memcpy_to/fromfs might be
2010 * inline and thus not flush cached variables otherwise).
2011 */
2012
2013 peek_seq = sk->copied_seq;
2014 seq = &sk->copied_seq;
2015 if (flags & MSG_PEEK)
2016 seq = &peek_seq;
2017
2018 add_wait_queue(sk->sleep, &wait);
2019 sk->inuse = 1;
2020 while (len > 0)
2021 {
2022 struct sk_buff * skb;
2023 u32 offset;
2024
2025 /*
2026 * Are we at urgent data? Stop if we have read anything.
2027 */
2028
2029 if (copied && sk->urg_data && sk->urg_seq == *seq)
2030 break;
2031
2032 /*
2033 * Next get a buffer.
2034 */
2035
2036 current->state = TASK_INTERRUPTIBLE;
2037
2038 skb = skb_peek(&sk->receive_queue);
2039 do
2040 {
2041 if (!skb)
2042 break;
2043 if (before(*seq, skb->h.th->seq))
2044 break;
2045 offset = *seq - skb->h.th->seq;
2046 if (skb->h.th->syn)
2047 offset--;
2048 if (offset < skb->len)
2049 goto found_ok_skb;
2050 if (skb->h.th->fin)
2051 goto found_fin_ok;
2052 if (!(flags & MSG_PEEK))
2053 skb->used = 1;
2054 skb = skb->next;
2055 }
2056 while (skb != (struct sk_buff *)&sk->receive_queue);
2057
2058 if (copied)
2059 break;
2060
2061 if (sk->err)
2062 {
2063 copied = -sk->err;
2064 sk->err = 0;
2065 break;
2066 }
2067
2068 if (sk->state == TCP_CLOSE)
2069 {
2070 if (!sk->done)
2071 {
2072 sk->done = 1;
2073 break;
2074 }
2075 copied = -ENOTCONN;
2076 break;
2077 }
2078
2079 if (sk->shutdown & RCV_SHUTDOWN)
2080 {
2081 sk->done = 1;
2082 break;
2083 }
2084
2085 if (nonblock)
2086 {
2087 copied = -EAGAIN;
2088 break;
2089 }
2090
2091 cleanup_rbuf(sk);
2092 release_sock(sk);
2093 sk->socket->flags |= SO_WAITDATA;
2094 schedule();
2095 sk->socket->flags &= ~SO_WAITDATA;
2096 sk->inuse = 1;
2097
2098 if (current->signal & ~current->blocked)
2099 {
2100 copied = -ERESTARTSYS;
2101 break;
2102 }
2103 continue;
2104
2105 found_ok_skb:
2106 /*
2107 * Lock the buffer. We can be fairly relaxed as
2108 * an interrupt will never steal a buffer we are
2109 * using unless I've missed something serious in
2110 * tcp_data.
2111 */
2112
2113 skb->users++;
2114
2115 /*
2116 * Ok so how much can we use ?
2117 */
2118
2119 used = skb->len - offset;
2120 if (len < used)
2121 used = len;
2122 /*
2123 * Do we have urgent data here?
2124 */
2125
2126 if (sk->urg_data)
2127 {
2128 u32 urg_offset = sk->urg_seq - *seq;
2129 if (urg_offset < used)
2130 {
2131 if (!urg_offset)
2132 {
2133 if (!sk->urginline)
2134 {
2135 ++*seq;
2136 offset++;
2137 used--;
2138 }
2139 }
2140 else
2141 used = urg_offset;
2142 }
2143 }
2144
2145 /*
2146 * Copy it - We _MUST_ update *seq first so that we
2147 * don't ever double read when we have dual readers
2148 */
2149
2150 *seq += used;
2151
2152 /*
2153 * This memcpy_tofs can sleep. If it sleeps and we
2154 * do a second read it relies on the skb->users to avoid
2155 * a crash when cleanup_rbuf() gets called.
2156 */
2157
2158 memcpy_tofs(to,((unsigned char *)skb->h.th) +
2159 skb->h.th->doff*4 + offset, used);
2160 copied += used;
2161 len -= used;
2162 to += used;
2163
2164 /*
2165 * We now will not sleep again until we are finished
2166 * with skb. Sorry if you are doing the SMP port
2167 * but you'll just have to fix it neatly ;)
2168 */
2169
2170 skb->users --;
2171
2172 if (after(sk->copied_seq,sk->urg_seq))
2173 sk->urg_data = 0;
2174 if (used + offset < skb->len)
2175 continue;
2176
2177 /*
2178 * Process the FIN.
2179 */
2180
2181 if (skb->h.th->fin)
2182 goto found_fin_ok;
2183 if (flags & MSG_PEEK)
2184 continue;
2185 skb->used = 1;
2186 continue;
2187
2188 found_fin_ok:
2189 ++*seq;
2190 if (flags & MSG_PEEK)
2191 break;
2192
2193 /*
2194 * All is done
2195 */
2196
2197 skb->used = 1;
2198 sk->shutdown |= RCV_SHUTDOWN;
2199 break;
2200
2201 }
2202 remove_wait_queue(sk->sleep, &wait);
2203 current->state = TASK_RUNNING;
2204
2205 /* Clean up data we have read: This will do ACK frames */
2206 cleanup_rbuf(sk);
2207 release_sock(sk);
2208 return copied;
2209 }
2210
2211 /*
2212 * State processing on a close. This implements the state shift for
2213 * sending our FIN frame. Note that we only send a FIN for some
2214 * states. A shutdown() may have already sent the FIN, or we may be
2215 * closed.
2216 */
2217
2218 static int tcp_close_state(struct sock *sk, int dead)
2219 {
2220 int ns=TCP_CLOSE;
2221 int send_fin=0;
2222 switch(sk->state)
2223 {
2224 case TCP_SYN_SENT: /* No SYN back, no FIN needed */
2225 break;
2226 case TCP_SYN_RECV:
2227 case TCP_ESTABLISHED: /* Closedown begin */
2228 ns=TCP_FIN_WAIT1;
2229 send_fin=1;
2230 break;
2231 case TCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */
2232 case TCP_FIN_WAIT2:
2233 case TCP_CLOSING:
2234 ns=sk->state;
2235 break;
2236 case TCP_CLOSE:
2237 case TCP_LISTEN:
2238 break;
2239 case TCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and
2240 wait only for the ACK */
2241 ns=TCP_LAST_ACK;
2242 send_fin=1;
2243 }
2244
2245 tcp_set_state(sk,ns);
2246
2247 /*
2248 * This is a (useful) BSD violation of the RFC. There is a
2249 * problem with TCP as specified in that the other end could
2250 * keep a socket open forever with no application left at this end.
2251 * We use a 3 minute timeout (about the same as BSD) then kill
2252 * our end. If they send after that then tough - but the timeout is
2253 * long enough that we avoid the old mistake of using 4*rto, which
2254 * was almost no time at all and caused spurious resets.
2255 */
2256 if(dead && ns==TCP_FIN_WAIT2)
2257 {
2258 int timer_active=del_timer(&sk->timer);
2259 if(timer_active)
2260 add_timer(&sk->timer);
2261 else
2262 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2263 }
2264
2265 return send_fin;
2266 }
2267
2268 /*
2269 * Send a fin.
2270 */
2271
2272 static void tcp_send_fin(struct sock *sk)
2273 {
2274 struct proto *prot =(struct proto *)sk->prot;
2275 struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
2276 struct tcphdr *t1;
2277 struct sk_buff *buff;
2278 struct device *dev=NULL;
2279 int tmp;
2280
2281 release_sock(sk); /* in case the malloc sleeps. */
2282
2283 buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2284 sk->inuse = 1;
2285
2286 if (buff == NULL)
2287 {
2288 /* This is a disaster if it occurs */
2289 printk("tcp_send_fin: Impossible malloc failure\n");
2290 return;
2291 }
2292
2293 /*
2294 * Administrivia
2295 */
2296
2297 buff->sk = sk;
2298 buff->localroute = sk->localroute;
2299
2300 /*
2301 * Put in the IP header and routing stuff.
2302 */
2303
2304 tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2305 IPPROTO_TCP, sk->opt,
2306 sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
2307 if (tmp < 0)
2308 {
2309 int t;
2310 /*
2311 * Finish anyway, treat this as a send that got lost.
2312 * (Not good).
2313 */
2314
2315 buff->free = 1;
2316 prot->wfree(sk,buff);
2317 sk->write_seq++;
2318 t=del_timer(&sk->timer);
2319 if(t)
2320 add_timer(&sk->timer);
2321 else
2322 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2323 return;
2324 }
2325
2326 /*
2327 * We ought to check if the end of the queue is a buffer and
2328 * if so simply add the fin to that buffer, not send it ahead.
2329 */
2330
2331 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2332 buff->dev = dev;
2333 memcpy(t1, th, sizeof(*t1));
2334 t1->seq = ntohl(sk->write_seq);
2335 sk->write_seq++;
2336 buff->h.seq = sk->write_seq;
2337 t1->ack = 1;
2338 t1->ack_seq = ntohl(sk->acked_seq);
2339 t1->window = ntohs(sk->window=tcp_select_window(sk));
2340 t1->fin = 1;
2341 t1->rst = 0;
2342 t1->doff = sizeof(*t1)/4;
2343 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2344
2345 /*
2346 * If there is data in the write queue, the fin must be appended to
2347 * the write queue.
2348 */
2349
2350 if (skb_peek(&sk->write_queue) != NULL)
2351 {
2352 buff->free = 0;
2353 if (buff->next != NULL)
2354 {
2355 printk("tcp_send_fin: next != NULL\n");
2356 skb_unlink(buff);
2357 }
2358 skb_queue_tail(&sk->write_queue, buff);
2359 }
2360 else
2361 {
2362 sk->sent_seq = sk->write_seq;
2363 sk->prot->queue_xmit(sk, dev, buff, 0);
2364 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2365 }
2366 }
2367
2368 /*
2369 * Shutdown the sending side of a connection. Much like close except
2370 * that we don't shut down reception or set sk->dead=1.
2371 */
2372
2373 void tcp_shutdown(struct sock *sk, int how)
2374 {
2375 /*
2376 * We need to grab some memory, and put together a FIN,
2377 * and then put it into the queue to be sent.
2378 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2379 */
2380
2381 if (!(how & SEND_SHUTDOWN))
2382 return;
2383
2384 /*
2385 * If we've already sent a FIN, or it's a closed state
2386 */
2387
2388 if (sk->state == TCP_FIN_WAIT1 ||
2389 sk->state == TCP_FIN_WAIT2 ||
2390 sk->state == TCP_CLOSING ||
2391 sk->state == TCP_LAST_ACK ||
2392 sk->state == TCP_TIME_WAIT ||
2393 sk->state == TCP_CLOSE ||
2394 sk->state == TCP_LISTEN
2395 )
2396 {
2397 return;
2398 }
2399 sk->inuse = 1;
2400
2401 /*
2402 * Flag that the sender has shut down.
2403 */
2404
2405 sk->shutdown |= SEND_SHUTDOWN;
2406
2407 /*
2408 * Clear out any half completed packets.
2409 */
2410
2411 if (sk->partial)
2412 tcp_send_partial(sk);
2413
2414 /*
2415 * FIN if needed
2416 */
2417
2418 if(tcp_close_state(sk,0))
2419 tcp_send_fin(sk);
2420
2421 release_sock(sk);
2422 }
2423
2424
2425 static int
2426 tcp_recvfrom(struct sock *sk, unsigned char *to,
2427 int to_len, int nonblock, unsigned flags,
2428 struct sockaddr_in *addr, int *addr_len)
2429 {
2430 int result;
2431
2432 /*
2433 * Have to check these first, unlike the old code. If
2434 * we checked them afterwards we could lose data on an
2435 * error, which is wrong.
2436 */
2437
2438 if(addr_len)
2439 *addr_len = sizeof(*addr);
2440 result=tcp_read(sk, to, to_len, nonblock, flags);
2441
2442 if (result < 0)
2443 return(result);
2444
2445 if(addr)
2446 {
2447 addr->sin_family = AF_INET;
2448 addr->sin_port = sk->dummy_th.dest;
2449 addr->sin_addr.s_addr = sk->daddr;
2450 }
2451 return(result);
2452 }
2453
2454
2455 /*
2456 * This routine will send an RST to the other tcp.
2457 */
2458
2459 static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
2460 struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
2461 {
2462 struct sk_buff *buff;
2463 struct tcphdr *t1;
2464 int tmp;
2465 struct device *ndev=NULL;
2466
2467 /*
2468 * Cannot reset a reset (Think about it).
2469 */
2470
2471 if(th->rst)
2472 return;
2473
2474 /*
2475 * We need to grab some memory, and put together an RST,
2476 * and then put it into the queue to be sent.
2477 */
2478
2479 buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2480 if (buff == NULL)
2481 return;
2482
2483 buff->sk = NULL;
2484 buff->dev = dev;
2485 buff->localroute = 0;
2486
2487 /*
2488 * Put in the IP header and routing stuff.
2489 */
2490
2491 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2492 sizeof(struct tcphdr),tos,ttl);
2493 if (tmp < 0)
2494 {
2495 buff->free = 1;
2496 prot->wfree(NULL, buff);
2497 return;
2498 }
2499
2500 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2501 memcpy(t1, th, sizeof(*t1));
2502
2503 /*
2504 * Swap the send and the receive.
2505 */
2506
2507 t1->dest = th->source;
2508 t1->source = th->dest;
2509 t1->rst = 1;
2510 t1->window = 0;
2511
2512 if(th->ack)
2513 {
2514 t1->ack = 0;
2515 t1->seq = th->ack_seq;
2516 t1->ack_seq = 0;
2517 }
2518 else
2519 {
2520 t1->ack = 1;
2521 if(!th->syn)
2522 t1->ack_seq=htonl(th->seq);
2523 else
2524 t1->ack_seq=htonl(th->seq+1);
2525 t1->seq=0;
2526 }
2527
2528 t1->syn = 0;
2529 t1->urg = 0;
2530 t1->fin = 0;
2531 t1->psh = 0;
2532 t1->doff = sizeof(*t1)/4;
2533 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2534 prot->queue_xmit(NULL, ndev, buff, 1);
2535 tcp_statistics.TcpOutSegs++;
2536 }
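/*
 * The sequence/ack swap above mirrors the RFC 793 reset rules: if
 * the offending segment carried an ACK, the reset reuses that ack
 * value as its own sequence number and clears the ACK bit; otherwise
 * it sends seq 0 and acks the segment's sequence number (plus one
 * for a SYN - note the data length is not counted here) so the other
 * end can match the reset against what it sent.
 */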
2537
2538
2539 /*
2540 * Look for tcp options. Parses everything but only knows about MSS.
2541 * This routine is always called with the packet containing the SYN.
2542 * However it may also be called with the ack to the SYN. So you
2543 * can't assume this is always the SYN. It's always called after
2544 * we have set up sk->mtu to our own MTU.
2545 *
2546 * We need at minimum to add PAWS support here. Possibly large windows
2547 * as Linux gets deployed on 100Mb/sec networks.
2548 */
2549
2550 static void tcp_options(struct sock *sk, struct tcphdr *th)
2551 {
2552 unsigned char *ptr;
2553 int length=(th->doff*4)-sizeof(struct tcphdr);
2554 int mss_seen = 0;
2555
2556 ptr = (unsigned char *)(th + 1);
2557
2558 while(length>0)
2559 {
2560 int opcode=*ptr++;
2561 int opsize=*ptr++;
2562 switch(opcode)
2563 {
2564 case TCPOPT_EOL:
2565 return;
2566 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
2567 length--;
2568 ptr--; /* the opsize=*ptr++ above was a mistake */
2569 continue;
2570
2571 default:
2572 if(opsize<=2) /* Avoid silly options looping forever */
2573 return;
2574 switch(opcode)
2575 {
2576 case TCPOPT_MSS:
2577 if(opsize==4 && th->syn)
2578 {
2579 sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2580 mss_seen = 1;
2581 }
2582 break;
2583 /* Add other options here as people feel the urge to implement stuff like large windows */
2584 }
2585 ptr+=opsize-2;
2586 length-=opsize;
2587 }
2588 }
2589 if (th->syn)
2590 {
2591 if (! mss_seen)
2592 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */
2593 }
2594 #ifdef CONFIG_INET_PCTCP
2595 sk->mss = min(sk->max_window >> 1, sk->mtu);
2596 #else
2597 sk->mss = min(sk->max_window, sk->mtu);
2598 #endif
2599 }
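/*
 * Example option block the parser above accepts, in hex:
 * 01 01 02 04 05 b4 - two NOPs for padding, then MSS (kind 2,
 * length 4) carrying 0x05b4 = 1460. An EOL (00) ends parsing, a NOP
 * consumes a single byte, and any unknown option is skipped via its
 * length byte, with opsize <= 2 rejected to avoid looping forever.
 */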
2600
2601 static inline unsigned long default_mask(unsigned long dst)
2602 {
2603 dst = ntohl(dst);
2604 if (IN_CLASSA(dst))
2605 return htonl(IN_CLASSA_NET);
2606 if (IN_CLASSB(dst))
2607 return htonl(IN_CLASSB_NET);
2608 return htonl(IN_CLASSC_NET);
2609 }
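/*
 * Worked example (classful rules, addresses illustrative): 10.1.2.3
 * lies in class A, so default_mask() yields 255.0.0.0; 172.16.0.1 is
 * class B, giving 255.255.0.0; 192.168.1.1 is class C, giving
 * 255.255.255.0.
 */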
2610
2611 /*
2612 * Default sequence number picking algorithm.
2613 * As close as possible to RFC 793, which
2614 * suggests using a 250kHz clock.
2615 * Further reading shows this assumes 2MB/s networks.
2616 * For 10MB/s ethernet, a 1MHz clock is appropriate.
2617 * That's funny, Linux has one built in! Use it!
2618 */
2619
2620 extern inline u32 tcp_init_seq(void)
2621 {
2622 struct timeval tv;
2623 do_gettimeofday(&tv);
2624 return tv.tv_usec+tv.tv_sec*1000000;
2625 }
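/*
 * At 1MHz the 32 bit sequence space wraps every 2^32 microseconds,
 * about 4295 seconds (roughly 72 minutes). The overflow of
 * tv.tv_sec*1000000 is harmless since only the value modulo 2^32
 * is kept.
 */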
2626
2627 /*
2628 * This routine handles a connection request.
2629 * It should make sure we haven't already responded.
2630 * Because of the way BSD works, we have to send a syn/ack now.
2631 * This also means it will be harder to close a socket which is
2632 * listening.
2633 */
2634
2635 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
2636 unsigned long daddr, unsigned long saddr,
2637 struct options *opt, struct device *dev, u32 seq)
2638 {
2639 struct sk_buff *buff;
2640 struct tcphdr *t1;
2641 unsigned char *ptr;
2642 struct sock *newsk;
2643 struct tcphdr *th;
2644 struct device *ndev=NULL;
2645 int tmp;
2646 struct rtable *rt;
2647
2648 th = skb->h.th;
2649
2650 /* If the socket is dead, don't accept the connection. */
2651 if (!sk->dead)
2652 {
2653 sk->data_ready(sk,0);
2654 }
2655 else
2656 {
2657 if(sk->debug)
2658 printk("Reset on %p: Connect on dead socket.\n",sk);
2659 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2660 tcp_statistics.TcpAttemptFails++;
2661 kfree_skb(skb, FREE_READ);
2662 return;
2663 }
2664
2665 /*
2666 * Make sure we can accept more. This will prevent a
2667 * flurry of syns from eating up all our memory.
2668 */
2669
2670 if (sk->ack_backlog >= sk->max_ack_backlog)
2671 {
2672 tcp_statistics.TcpAttemptFails++;
2673 kfree_skb(skb, FREE_READ);
2674 return;
2675 }
2676
2677 /*
2678 * We need to build a new sock struct.
2679 * It is sort of bad to have a socket without an inode attached
2680 * to it, but the wake_up's will just wake up the listening socket,
2681 * and if the listening socket is destroyed before this is taken
2682 * off of the queue, this will take care of it.
2683 */
2684
2685 newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2686 if (newsk == NULL)
2687 {
2688 /* just ignore the syn. It will get retransmitted. */
2689 tcp_statistics.TcpAttemptFails++;
2690 kfree_skb(skb, FREE_READ);
2691 return;
2692 }
2693
2694 memcpy(newsk, sk, sizeof(*newsk));
2695 skb_queue_head_init(&newsk->write_queue);
2696 skb_queue_head_init(&newsk->receive_queue);
2697 newsk->send_head = NULL;
2698 newsk->send_tail = NULL;
2699 skb_queue_head_init(&newsk->back_log);
2700 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/
2701 newsk->rto = TCP_TIMEOUT_INIT;
2702 newsk->mdev = 0;
2703 newsk->max_window = 0;
2704 newsk->cong_window = 1;
2705 newsk->cong_count = 0;
2706 newsk->ssthresh = 0;
2707 newsk->backoff = 0;
2708 newsk->blog = 0;
2709 newsk->intr = 0;
2710 newsk->proc = 0;
2711 newsk->done = 0;
2712 newsk->partial = NULL;
2713 newsk->pair = NULL;
2714 newsk->wmem_alloc = 0;
2715 newsk->rmem_alloc = 0;
2716 newsk->localroute = sk->localroute;
2717
2718 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
2719
2720 newsk->err = 0;
2721 newsk->shutdown = 0;
2722 newsk->ack_backlog = 0;
2723 newsk->acked_seq = skb->h.th->seq+1;
2724 newsk->copied_seq = skb->h.th->seq+1;
2725 newsk->fin_seq = skb->h.th->seq;
2726 newsk->state = TCP_SYN_RECV;
2727 newsk->timeout = 0;
2728 newsk->ip_xmit_timeout = 0;
2729 newsk->write_seq = seq;
2730 newsk->window_seq = newsk->write_seq;
2731 newsk->rcv_ack_seq = newsk->write_seq;
2732 newsk->urg_data = 0;
2733 newsk->retransmits = 0;
2734 newsk->linger=0;
2735 newsk->destroy = 0;
2736 init_timer(&newsk->timer);
2737 newsk->timer.data = (unsigned long)newsk;
2738 newsk->timer.function = &net_timer;
2739 init_timer(&newsk->retransmit_timer);
2740 newsk->retransmit_timer.data = (unsigned long)newsk;
2741 newsk->retransmit_timer.function=&retransmit_timer;
2742 newsk->dummy_th.source = skb->h.th->dest;
2743 newsk->dummy_th.dest = skb->h.th->source;
2744
2745 /*
2746 * Swap these two, they are from our point of view.
2747 */
2748
2749 newsk->daddr = saddr;
2750 newsk->saddr = daddr;
2751
2752 put_sock(newsk->num,newsk);
2753 newsk->dummy_th.res1 = 0;
2754 newsk->dummy_th.doff = 6;
2755 newsk->dummy_th.fin = 0;
2756 newsk->dummy_th.syn = 0;
2757 newsk->dummy_th.rst = 0;
2758 newsk->dummy_th.psh = 0;
2759 newsk->dummy_th.ack = 0;
2760 newsk->dummy_th.urg = 0;
2761 newsk->dummy_th.res2 = 0;
2762 newsk->acked_seq = skb->h.th->seq + 1;
2763 newsk->copied_seq = skb->h.th->seq + 1;
2764 newsk->socket = NULL;
2765
2766 /*
2767 * Grab the ttl and tos values and use them
2768 */
2769
2770 newsk->ip_ttl=sk->ip_ttl;
2771 newsk->ip_tos=skb->ip_hdr->tos;
2772
2773 /*
2774 * Use 512 or whatever user asked for
2775 */
2776
2777 /*
2778 * Note use of sk->user_mss, since user has no direct access to newsk
2779 */
2780
2781 rt=ip_rt_route(saddr, NULL,NULL);
2782
2783 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
2784 newsk->window_clamp = rt->rt_window;
2785 else
2786 newsk->window_clamp = 0;
2787
2788 if (sk->user_mss)
2789 newsk->mtu = sk->user_mss;
2790 else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
2791 newsk->mtu = rt->rt_mss - HEADER_SIZE;
2792 else
2793 {
2794 #ifdef CONFIG_INET_SNARL /* Sub Nets Are Local */
2795 if ((saddr ^ daddr) & default_mask(saddr))
2796 #else
2797 if ((saddr ^ daddr) & dev->pa_mask)
2798 #endif
2799 newsk->mtu = 576 - HEADER_SIZE;
2800 else
2801 newsk->mtu = MAX_WINDOW;
2802 }
2803
2804 /*
2805 * But not bigger than device MTU
2806 */
2807
2808 newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);
2809
2810 /*
2811 * This will min with what arrived in the packet
2812 */
2813
2814 tcp_options(newsk,skb->h.th);
2815
2816 tcp_cache_zap();
2817
2818 buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
2819 if (buff == NULL)
2820 {
2821 sk->err = ENOMEM;
2822 newsk->dead = 1;
2823 newsk->state = TCP_CLOSE;
2824 /* And this will destroy it */
2825 release_sock(newsk);
2826 kfree_skb(skb, FREE_READ);
2827 tcp_statistics.TcpAttemptFails++;
2828 return;
2829 }
2830
2831 buff->sk = newsk;
2832 buff->localroute = newsk->localroute;
2833
2834 /*
2835 * Put in the IP header and routing stuff.
2836 */
2837
2838 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
2839 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
2840
2841 /*
2842 * Something went wrong.
2843 */
2844
2845 if (tmp < 0)
2846 {
2847 sk->err = tmp;
2848 buff->free = 1;
2849 kfree_skb(buff,FREE_WRITE);
2850 newsk->dead = 1;
2851 newsk->state = TCP_CLOSE;
2852 release_sock(newsk);
2853 skb->sk = sk;
2854 kfree_skb(skb, FREE_READ);
2855 tcp_statistics.TcpAttemptFails++;
2856 return;
2857 }
2858
2859 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2860
2861 memcpy(t1, skb->h.th, sizeof(*t1));
2862 buff->h.seq = newsk->write_seq;
2863 /*
2864 * Swap the send and the receive.
2865 */
2866 t1->dest = skb->h.th->source;
2867 t1->source = newsk->dummy_th.source;
2868 t1->seq = ntohl(newsk->write_seq++);
2869 t1->ack = 1;
2870 newsk->window = tcp_select_window(newsk);
2871 newsk->sent_seq = newsk->write_seq;
2872 t1->window = ntohs(newsk->window);
2873 t1->res1 = 0;
2874 t1->res2 = 0;
2875 t1->rst = 0;
2876 t1->urg = 0;
2877 t1->psh = 0;
2878 t1->syn = 1;
2879 t1->ack_seq = ntohl(skb->h.th->seq+1);
2880 t1->doff = sizeof(*t1)/4+1;
2881 ptr = skb_put(buff,4);
2882 ptr[0] = 2;
2883 ptr[1] = 4;
2884 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
2885 ptr[3] =(newsk->mtu) & 0xff;
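/*
 * The four bytes just written form the MSS option: kind 2, length 4,
 * then the 16 bit MSS in network byte order.
 */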
2886
2887 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
2888 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
2889 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
2890 skb->sk = newsk;
2891
2892 /*
2893 * Charge the sock_buff to newsk.
2894 */
2895
2896 sk->rmem_alloc -= skb->truesize;
2897 newsk->rmem_alloc += skb->truesize;
2898
2899 skb_queue_tail(&sk->receive_queue,skb);
2900 sk->ack_backlog++;
2901 release_sock(newsk);
2902 tcp_statistics.TcpOutSegs++;
2903 }
2904
2905
2906 static void tcp_close(struct sock *sk, int timeout)
2907 {
2908 /*
2909 * We need to grab some memory, and put together a FIN,
2910 * and then put it into the queue to be sent.
2911 */
2912
2913 sk->inuse = 1;
2914
2915 if(th_cache_sk==sk)
2916 tcp_cache_zap();
2917 if(sk->state == TCP_LISTEN)
2918 {
2919 /* Special case */
2920 tcp_set_state(sk, TCP_CLOSE);
2921 tcp_close_pending(sk);
2922 release_sock(sk);
2923 return;
2924 }
2925
2926 sk->keepopen = 1;
2927 sk->shutdown = SHUTDOWN_MASK;
2928
2929 if (!sk->dead)
2930 sk->state_change(sk);
2931
2932 if (timeout == 0)
2933 {
2934 struct sk_buff *skb;
2935
2936 /*
2937 * We need to flush the recv. buffs. We do this only on the
2938 * descriptor close, not protocol-sourced closes, because the
2939 * reader process may not have drained the data yet!
2940 */
2941
2942 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
2943 kfree_skb(skb, FREE_READ);
2944 /*
2945 * Get rid of any half-completed packets.
2946 */
2947
2948 if (sk->partial)
2949 tcp_send_partial(sk);
2950 }
2951
2952
2953 /*
2954 * Timeout is not the same thing - however the code likes
2955 * to send both the same way (sigh).
2956 */
2957
2958 if(timeout)
2959 {
2960 tcp_set_state(sk, TCP_CLOSE); /* Dead */
2961 }
2962 else
2963 {
2964 if(tcp_close_state(sk,1)==1)
2965 {
2966 tcp_send_fin(sk);
2967 }
2968 }
2969 release_sock(sk);
2970 }
2971
2972
2973 /*
2974 * This routine takes stuff off of the write queue,
2975 * and puts it in the xmit queue. This happens as incoming acks
2976 * open up the remote window for us.
2977 */
2978
2979 static void tcp_write_xmit(struct sock *sk)
2980 {
2981 struct sk_buff *skb;
2982
2983 /*
2984 * The bytes will have to remain here. In time closedown will
2985 * empty the write queue and all will be happy
2986 */
2987
2988 if(sk->zapped)
2989 return;
2990
2991 /*
2992 * Anything on the transmit queue that fits the window can
2993 * be added providing we are not
2994 *
2995 * a) retransmitting (Nagle's rule)
2996 * b) exceeding our congestion window.
2997 */
2998
2999 while((skb = skb_peek(&sk->write_queue)) != NULL &&
3000 before(skb->h.seq, sk->window_seq + 1) &&
3001 (sk->retransmits == 0 ||
3002 sk->ip_xmit_timeout != TIME_WRITE ||
3003 before(skb->h.seq, sk->rcv_ack_seq + 1))
3004 && sk->packets_out < sk->cong_window)
3005 {
3006 IS_SKB(skb);
3007 skb_unlink(skb);
3008
3009 /*
3010 * See if we really need to send the packet.
3011 */
3012
3013 if (before(skb->h.seq, sk->rcv_ack_seq +1))
3014 {
3015 /*
3016 * This is acked data. We can discard it. This
3017 * cannot currently occur.
3018 */
3019
3020 sk->retransmits = 0;
3021 kfree_skb(skb, FREE_WRITE);
3022 if (!sk->dead)
3023 sk->write_space(sk);
3024 }
3025 else
3026 {
3027 struct tcphdr *th;
3028 struct iphdr *iph;
3029 int size;
3030 /*
3031 * put in the ack seq and window at this point rather than earlier,
3032 * in order to keep them monotonic. We really want to avoid taking
3033 * back window allocations. That's legal, but RFC1122 says it's frowned on.
3034 * Ack and window will in general have changed since this packet was put
3035 * on the write queue.
3036 */
3037 iph = (struct iphdr *)(skb->data +
3038 skb->dev->hard_header_len);
3039 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
3040 size = skb->len - (((unsigned char *) th) - skb->data);
3041
3042 th->ack_seq = ntohl(sk->acked_seq);
3043 th->window = ntohs(tcp_select_window(sk));
3044
3045 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
3046
3047 sk->sent_seq = skb->h.seq;
3048
3049 /*
3050 * IP manages our queue for some crazy reason
3051 */
3052
3053 sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
3054
3055 /*
3056 * Again we slide the timer wrongly
3057 */
3058
3059 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3060 }
3061 }
3062 }
3063
3064
3065 /*
3066 * This routine deals with incoming acks, but not outgoing ones.
3067 */
3068
3069 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
3070 {
3071 u32 ack;
3072 int flag = 0;
3073
3074 /*
3075 * 1 - there was data in packet as well as ack or new data is sent or
3076 * in shutdown state
3077 * 2 - data from retransmit queue was acked and removed
3078 * 4 - window shrunk or data from retransmit queue was acked and removed
3079 */
3080
3081 if(sk->zapped)
3082 return(1); /* Dead, can't ack any more so why bother */
3083
3084 /*
3085 * Have we discovered a larger window
3086 */
3087
3088 ack = ntohl(th->ack_seq);
3089
3090 if (ntohs(th->window) > sk->max_window)
3091 {
3092 sk->max_window = ntohs(th->window);
3093 #ifdef CONFIG_INET_PCTCP
3094 /* Hack because we don't send partial packets to non SWS
3095 handling hosts */
3096 sk->mss = min(sk->max_window>>1, sk->mtu);
3097 #else
3098 sk->mss = min(sk->max_window, sk->mtu);
3099 #endif
3100 }
3101
3102 /*
3103 * We have dropped back to keepalive timeouts. Thus we have
3104 * no retransmits pending.
3105 */
3106
3107 if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3108 sk->retransmits = 0;
3109
3110 /*
3111 * If the ack is newer than sent or older than previous acks
3112 * then we can probably ignore it.
3113 */
3114
3115 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
3116 {
3117 if(sk->debug)
3118 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3119
3120 /*
3121 * Keepalive processing.
3122 */
3123
3124 if (after(ack, sk->sent_seq))
3125 {
3126 return(0);
3127 }
3128
3129 /*
3130 * Restart the keepalive timer.
3131 */
3132
3133 if (sk->keepopen)
3134 {
3135 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3136 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3137 }
3138 return(1);
3139 }
3140
3141 /*
3142 * If there is data set flag 1
3143 */
3144
3145 if (len != th->doff*4)
3146 flag |= 1;
3147
3148 /*
3149 * See if our window has been shrunk.
3150 */
3151
3152 if (after(sk->window_seq, ack+ntohs(th->window)))
3153 {
3154 /*
3155 * We may need to move packets from the send queue
3156 * to the write queue, if the window has been shrunk on us.
3157 * The RFC says you are not allowed to shrink your window
3158 * like this, but if the other end does, you must be able
3159 * to deal with it.
3160 */
3161 struct sk_buff *skb;
3162 struct sk_buff *skb2;
3163 struct sk_buff *wskb = NULL;
3164
3165 skb2 = sk->send_head;
3166 sk->send_head = NULL;
3167 sk->send_tail = NULL;
3168
3169 /*
3170 * This is an artifact of a flawed concept. We want one
3171 * queue and a smarter send routine when we send all.
3172 */
3173
3174 flag |= 4; /* Window changed */
3175
3176 sk->window_seq = ack + ntohs(th->window);
3177 cli();
3178 while (skb2 != NULL)
3179 {
3180 skb = skb2;
3181 skb2 = skb->link3;
3182 skb->link3 = NULL;
3183 if (after(skb->h.seq, sk->window_seq))
3184 {
3185 if (sk->packets_out > 0)
3186 sk->packets_out--;
3187 /* We may need to remove this from the dev send list. */
3188 if (skb->next != NULL)
3189 {
3190 skb_unlink(skb);
3191 }
3192 /* Now add it to the write_queue. */
3193 if (wskb == NULL)
3194 skb_queue_head(&sk->write_queue,skb);
3195 else
3196 skb_append(wskb,skb);
3197 wskb = skb;
3198 }
3199 else
3200 {
3201 if (sk->send_head == NULL)
3202 {
3203 sk->send_head = skb;
3204 sk->send_tail = skb;
3205 }
3206 else
3207 {
3208 sk->send_tail->link3 = skb;
3209 sk->send_tail = skb;
3210 }
3211 skb->link3 = NULL;
3212 }
3213 }
3214 sti();
3215 }
3216
3217 /*
3218 * Pipe has emptied
3219 */
3220
3221 if (sk->send_tail == NULL || sk->send_head == NULL)
3222 {
3223 sk->send_head = NULL;
3224 sk->send_tail = NULL;
3225 sk->packets_out= 0;
3226 }
3227
3228 /*
3229 * Update the right hand window edge of the host
3230 */
3231
3232 sk->window_seq = ack + ntohs(th->window);
3233
3234 /*
3235 * We don't want too many packets out there.
3236 */
3237
3238 if (sk->ip_xmit_timeout == TIME_WRITE &&
3239 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
3240 {
3241 /*
3242 * This is Jacobson's slow start and congestion avoidance.
3243 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
3244 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
3245 * counter and increment it once every cwnd times. It's possible
3246 * that this should be done only if sk->retransmits == 0. I'm
3247 * interpreting "new data is acked" as including data that has
3248 * been retransmitted but is just now being acked.
3249 */
3250 if (sk->cong_window < sk->ssthresh)
3251 /*
3252 * In "safe" area, increase
3253 */
3254 sk->cong_window++;
3255 else
3256 {
3257 /*
3258 * In dangerous area, increase slowly. In theory this is
3259 * sk->cong_window += 1 / sk->cong_window
3260 */
3261 if (sk->cong_count >= sk->cong_window)
3262 {
3263 sk->cong_window++;
3264 sk->cong_count = 0;
3265 }
3266 else
3267 sk->cong_count++;
3268 }
3269 }
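/*
 * Growth trace for the code above (illustrative): with ssthresh = 8,
 * cong_window climbs 1,2,3,...,8 at one mss per new ack (slow start);
 * past ssthresh it takes cong_window further acks to add a single
 * mss, approximating cwnd += 1/cwnd.
 */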
3270
3271 /*
3272 * Remember the highest ack received.
3273 */
3274
3275 sk->rcv_ack_seq = ack;
3276
3277 /*
3278 * If this ack opens up a zero window, clear backoff. It was
3279 * being used to time the probes, and is probably far higher than
3280 * it needs to be for normal retransmission.
3281 */
3282
3283 if (sk->ip_xmit_timeout == TIME_PROBE0)
3284 {
3285 sk->retransmits = 0; /* Our probe was answered */
3286
3287 /*
3288 * Was it a usable window open ?
3289 */
3290
3291 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */
3292 ! before (sk->window_seq, sk->write_queue.next->h.seq))
3293 {
3294 sk->backoff = 0;
3295
3296 /*
3297 * Recompute rto from rtt. this eliminates any backoff.
3298 */
3299
3300 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3301 if (sk->rto > 120*HZ)
3302 sk->rto = 120*HZ;
3303 if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about
3304 .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3305 .2 of a second is going to need huge windows (SIGH) */
3306 sk->rto = 20;
3307 }
3308 }
3309
3310 /*
3311 * See if we can take anything off of the retransmit queue.
3312 */
3313
3314 while(sk->send_head != NULL)
3315 {
3316 /* Check for a bug. */
3317 if (sk->send_head->link3 &&
3318 after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
3319 printk("INET: tcp.c: *** bug send_list out of order.\n");
3320
3321 /*
3322 * If our packet is before the ack sequence we can
3323 * discard it as it's confirmed to have arrived at the other end.
3324 */
3325
3326 if (before(sk->send_head->h.seq, ack+1))
3327 {
3328 struct sk_buff *oskb;
3329 if (sk->retransmits)
3330 {
3331 /*
3332 * We were retransmitting. don't count this in RTT est
3333 */
3334 flag |= 2;
3335
3336 /*
3337 * even though we've gotten an ack, we're still
3338 * retransmitting as long as we're sending from
3339 * the retransmit queue. Keeping retransmits non-zero
3340 * prevents us from getting new data interspersed with
3341 * retransmissions.
3342 */
3343
3344 if (sk->send_head->link3) /* Any more queued retransmits? */
3345 sk->retransmits = 1;
3346 else
3347 sk->retransmits = 0;
3348 }
3349 /*
3350 * Note that we only reset backoff and rto in the
3351 * rtt recomputation code. And that doesn't happen
3352 * if there were retransmissions in effect. So the
3353 * first new packet after the retransmissions is
3354 * sent with the backoff still in effect. Not until
3355 * we get an ack from a non-retransmitted packet do
3356 * we reset the backoff and rto. This allows us to deal
3357 * with a situation where the network delay has increased
3358 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3359 */
3360
3361 /*
3362 * We have one less packet out there.
3363 */
3364
3365 if (sk->packets_out > 0)
3366 sk->packets_out --;
3367 /*
3368 * Wake up the process, it can probably write more.
3369 */
3370 if (!sk->dead)
3371 sk->write_space(sk);
3372 oskb = sk->send_head;
3373
3374 if (!(flag&2)) /* Not retransmitting */
3375 {
3376 long m;
3377
3378 /*
3379 * The following amusing code comes from Jacobson's
3380 * article in SIGCOMM '88. Note that rtt and mdev
3381 * are scaled versions of rtt and mean deviation.
3382 * This is designed to be as fast as possible
3383 * m stands for "measurement".
3384 */
3385
3386 m = jiffies - oskb->when; /* RTT */
3387 if(m<=0)
3388 m=1; /* IS THIS RIGHT FOR <0 ??? */
3389 m -= (sk->rtt >> 3); /* m is now error in rtt est */
3390 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
3391 if (m < 0)
3392 m = -m; /* m is now abs(error) */
3393 m -= (sk->mdev >> 2); /* similar update on mdev */
3394 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
3395
3396 /*
3397 * Now update timeout. Note that this removes any backoff.
3398 */
3399
3400 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3401 if (sk->rto > 120*HZ)
3402 sk->rto = 120*HZ;
3403 if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3404 sk->rto = 20;
3405 sk->backoff = 0;
3406 }
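/*
 * Numeric example of the fixed point update above (illustrative):
 * sk->rtt holds 8*srtt and sk->mdev holds 4*mdev. With sk->rtt = 800
 * (an srtt of 100 ticks) and a measurement m = 120: m -= 100 leaves
 * 20, so sk->rtt becomes 820; |error| = 20 then feeds sk->mdev. The
 * new rto is ((sk->rtt >> 2) + sk->mdev) >> 1, clamped to the range
 * [20, 120*HZ].
 */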
3407 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt'
3408 in this case, as we just set it up */
3409 cli();
3410 oskb = sk->send_head;
3411 IS_SKB(oskb);
3412 sk->send_head = oskb->link3;
3413 if (sk->send_head == NULL)
3414 {
3415 sk->send_tail = NULL;
3416 }
3417
3418 /*
3419 * We may need to remove this from the dev send list.
3420 */
3421
3422 if (oskb->next)
3423 skb_unlink(oskb);
3424 sti();
3425 kfree_skb(oskb, FREE_WRITE); /* write. */
3426 if (!sk->dead)
3427 sk->write_space(sk);
3428 }
3429 else
3430 {
3431 break;
3432 }
3433 }
3434
3435 /*
3436 * XXX someone ought to look at this too.. at the moment, if skb_peek()
3437 * returns non-NULL, we completely ignore the timer stuff in the else
3438 * clause. We ought to organize the code so that the else clause can
3439 * (should) be executed regardless, possibly moving the PROBE timer
3440 * reset over. The skb_peek() thing should only move stuff to the
3441 * write queue, NOT also manage the timer functions.
3442 */
3443
3444 /*
3445 * Maybe we can take some stuff off of the write queue,
3446 * and put it onto the xmit queue.
3447 */
3448 if (skb_peek(&sk->write_queue) != NULL)
3449 {
3450 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3451 (sk->retransmits == 0 ||
3452 sk->ip_xmit_timeout != TIME_WRITE ||
3453 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3454 && sk->packets_out < sk->cong_window)
3455 {
3456 /*
3457 * Add more data to the send queue.
3458 */
3459 flag |= 1;
3460 tcp_write_xmit(sk);
3461 }
3462 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3463 sk->send_head == NULL &&
3464 sk->ack_backlog == 0 &&
3465 sk->state != TCP_TIME_WAIT)
3466 {
3467 /*
3468 * Data to queue but no room.
3469 */
3470 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3471 }
3472 }
3473 else
3474 {
3475 /*
3476 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3477 * from TCP_CLOSE we don't do anything
3478 *
3479 * from anything else, if there is write data (or fin) pending,
3480 * we use a TIME_WRITE timeout, else if keepalive we reset to
3481 * a KEEPALIVE timeout, else we delete the timer.
3482 *
3483 * We do not set flag for nominal write data, otherwise we may
3484 * force a state where we start to write itsy bitsy tidbits
3485 * of data.
3486 */
3487
3488 switch(sk->state) {
3489 case TCP_TIME_WAIT:
3490 /*
3491 * keep us in TIME_WAIT until we stop getting packets,
3492 * reset the timeout.
3493 */
3494 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3495 break;
3496 case TCP_CLOSE:
3497 /*
3498 * don't touch the timer.
3499 */
3500 break;
3501 default:
3502 /*
3503 * Must check send_head, write_queue, and ack_backlog
3504 * to determine which timeout to use.
3505 */
3506 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3507 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3508 } else if (sk->keepopen) {
3509 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3510 } else {
3511 del_timer(&sk->retransmit_timer);
3512 sk->ip_xmit_timeout = 0;
3513 }
3514 break;
3515 }
3516 }
3517
3518 /*
3519 * We have nothing queued but space to send. Send any partial
3520 * packets immediately (end of Nagle rule application).
3521 */
3522
3523 if (sk->packets_out == 0 && sk->partial != NULL &&
3524 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
3525 {
3526 flag |= 1;
3527 tcp_send_partial(sk);
3528 }
3529
3530 /*
3531 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
3532 * we are now waiting for an acknowledge to our FIN. The other end is
3533 * already in TIME_WAIT.
3534 *
3535 * Move to TCP_CLOSE on success.
3536 */
3537
3538 if (sk->state == TCP_LAST_ACK)
3539 {
3540 if (!sk->dead)
3541 sk->state_change(sk);
3542 if(sk->debug)
3543 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3544 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3545 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
3546 {
3547 flag |= 1;
3548 tcp_set_state(sk,TCP_CLOSE);
3549 sk->shutdown = SHUTDOWN_MASK;
3550 }
3551 }
3552
3553 /*
3554 * Incoming ACK to a FIN we sent in the case of our initiating the close.
3555 *
3556 * Move to FIN_WAIT2 to await a FIN from the other end. Set
3557 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3558 */
3559
3560 if (sk->state == TCP_FIN_WAIT1)
3561 {
3562
3563 if (!sk->dead)
3564 sk->state_change(sk);
3565 if (sk->rcv_ack_seq == sk->write_seq)
3566 {
3567 flag |= 1;
3568 sk->shutdown |= SEND_SHUTDOWN;
3569 tcp_set_state(sk, TCP_FIN_WAIT2);
3570 }
3571 }
3572
3573 /*
3574 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
3575 *
3576 * Move to TIME_WAIT
3577 */
3578
3579 if (sk->state == TCP_CLOSING)
3580 {
3581
3582 if (!sk->dead)
3583 sk->state_change(sk);
3584 if (sk->rcv_ack_seq == sk->write_seq)
3585 {
3586 flag |= 1;
3587 tcp_time_wait(sk);
3588 }
3589 }
3590
3591 /*
3592 * Final ack of a three way shake
3593 */
3594
3595 if(sk->state==TCP_SYN_RECV)
3596 {
3597 tcp_set_state(sk, TCP_ESTABLISHED);
3598 tcp_options(sk,th);
3599 sk->dummy_th.dest=th->source;
3600 sk->copied_seq = sk->acked_seq;
3601 if(!sk->dead)
3602 sk->state_change(sk);
3603 if(sk->max_window==0)
3604 {
3605 sk->max_window=32; /* Sanity check */
3606 sk->mss=min(sk->max_window,sk->mtu);
3607 }
3608 }
3609
3610 /*
3611 * I make no guarantees about the first clause in the following
3612 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
3613 * what conditions "!flag" would be true. However I think the rest
3614 * of the conditions would prevent that from causing any
3615 * unnecessary retransmission.
3616 * Clearly if the first packet has expired it should be
3617 * retransmitted. The other alternative, "flag&2 && retransmits", is
3618 * harder to explain: You have to look carefully at how and when the
3619 * timer is set and with what timeout. The most recent transmission always
3620 * sets the timer. So in general if the most recent thing has timed
3621 * out, everything before it has as well. So we want to go ahead and
3622 * retransmit some more. If we didn't explicitly test for this
3623 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3624 * would not be true. If you look at the pattern of timing, you can
3625 * show that rto is increased fast enough that the next packet would
3626 * almost never be retransmitted immediately. Then you'd end up
3627 * waiting for a timeout to send each packet on the retransmission
3628 * queue. With my implementation of the Karn sampling algorithm,
3629 * the timeout would double each time. The net result is that it would
3630 * take a hideous amount of time to recover from a single dropped packet.
3631 * It's possible that there should also be a test for TIME_WRITE, but
3632 * I think as long as "send_head != NULL" and "retransmit" is on, we've
3633 * got to be in real retransmission mode.
3634 * Note that tcp_do_retransmit is called with all==1. Setting cong_window
3635 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3636 * As long as no further losses occur, this seems reasonable.
3637 */
3638
3639 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3640 (((flag&2) && sk->retransmits) ||
3641 (sk->send_head->when + sk->rto < jiffies)))
3642 {
3643 if(sk->send_head->when + sk->rto < jiffies)
3644 tcp_retransmit(sk,0);
3645 else
3646 {
3647 tcp_do_retransmit(sk, 1);
3648 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3649 }
3650 }
3651
3652 return(1);
3653 }
3654
3655
3656 /*
3657 * Process the FIN bit. This now behaves as it is supposed to:
3658 * the FIN takes effect once it is validly part of the sequence
3659 * space, not earlier while we still have holes.
3660 *
3661 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3662 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
3663 * TIME-WAIT)
3664 *
3665 * If we are in FINWAIT-1, a received FIN indicates simultaneous
3666 * close and we go into CLOSING (and later onto TIME-WAIT)
3667 *
3668 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3669 *
3670 */
3671
3672 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
3673 {
3674 sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3675
3676 if (!sk->dead)
3677 {
3678 sk->state_change(sk);
3679 sock_wake_async(sk->socket, 1);
3680 }
3681
3682 switch(sk->state)
3683 {
3684 case TCP_SYN_RECV:
3685 case TCP_SYN_SENT:
3686 case TCP_ESTABLISHED:
3687 /*
3688 * move to CLOSE_WAIT, tcp_data() already handled
3689 * sending the ack.
3690 */
3691 tcp_set_state(sk,TCP_CLOSE_WAIT);
3692 if (th->rst)
3693 sk->shutdown = SHUTDOWN_MASK;
3694 break;
3695
3696 case TCP_CLOSE_WAIT:
3697 case TCP_CLOSING:
3698 /*
3699 * received a retransmission of the FIN, do
3700 * nothing.
3701 */
3702 break;
3703 case TCP_TIME_WAIT:
3704 /*
3705 * received a retransmission of the FIN,
3706 * restart the TIME_WAIT timer.
3707 */
3708 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3709 return(0);
3710 case TCP_FIN_WAIT1:
3711 /*
3712 * This case occurs when a simultaneous close
3713 * happens, we must ack the received FIN and
3714 * enter the CLOSING state.
3715 *
3716 * This causes a WRITE timeout, which will either
3717 * move on to TIME_WAIT when we timeout, or resend
3718 * the FIN properly (maybe we get rid of that annoying
3719 * FIN lost hang). The TIME_WRITE code is already correct
3720 * for handling this timeout.
3721 */
3722
3723 if(sk->ip_xmit_timeout != TIME_WRITE)
3724 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3725 tcp_set_state(sk,TCP_CLOSING);
3726 break;
3727 case TCP_FIN_WAIT2:
3728 /*
3729 * received a FIN -- send ACK and enter TIME_WAIT
3730 */
3731 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3732 sk->shutdown|=SHUTDOWN_MASK;
3733 tcp_set_state(sk,TCP_TIME_WAIT);
3734 break;
3735 case TCP_CLOSE:
3736 /*
3737 * already in CLOSE
3738 */
3739 break;
3740 default:
3741 tcp_set_state(sk,TCP_LAST_ACK);
3742
3743 /* Start the timers. */
3744 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3745 return(0);
3746 }
3747
3748 return(0);
3749 }
3750
3751
3752
3753 /*
3754 * This routine handles the data. If there is room in the buffer,
3755 * it will already have been moved into it. If there is no
3756 * room, then we will just have to discard the packet.
3757 */
3758
3759 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk,
3760 unsigned long saddr, unsigned short len)
3761 {
3762 struct sk_buff *skb1, *skb2;
3763 struct tcphdr *th;
3764 int dup_dumped=0;
3765 u32 new_seq, shut_seq;
3766
3767 th = skb->h.th;
3768 skb_pull(skb,th->doff*4);
3769 skb_trim(skb,len-(th->doff*4));
3770
3771 /*
3772 * The bytes in the receive read/assembly queue have increased. Needed for the
3773 * low memory discard algorithm
3774 */
3775
3776 sk->bytes_rcv += skb->len;
3777
3778 if (skb->len == 0 && !th->fin)
3779 {
3780 /*
3781 * Don't want to keep passing ack's back and forth.
3782 * (someone sent us dataless, boring frame)
3783 */
3784 if (!th->ack)
3785 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3786 kfree_skb(skb, FREE_READ);
3787 return(0);
3788 }
3789
3790 /*
3791 * We no longer have anyone receiving data on this connection.
3792 */
3793
3794 #ifndef TCP_DONT_RST_SHUTDOWN
3795
3796 if(sk->shutdown & RCV_SHUTDOWN)
3797 {
3798 /*
3799 * FIXME: BSD has some magic to avoid sending resets to
3800 * broken 4.2 BSD keepalives. Much to my surprise a few non
3801 * BSD stacks still have broken keepalives so we want to
3802 * cope with it.
3803 */
3804
3805 if(skb->len) /* We don't care if it's just an ack or
3806 a keepalive/window probe */
3807 {
3808 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */
3809
3810 /* Do this the way 4.4BSD treats it. Not what I'd
3811 regard as the meaning of the spec but it's what BSD
3812 does and clearly they know everything 8) */
3813
3814 /*
3815 * This is valid because of two things
3816 *
3817 * a) The way tcp_data behaves at the bottom.
3818 * b) A fin takes effect when read not when received.
3819 */
3820
3821 shut_seq=sk->acked_seq+1; /* Last byte */
3822
3823 if(after(new_seq,shut_seq))
3824 {
3825 if(sk->debug)
3826 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
3827 sk, new_seq, shut_seq, sk->blog);
3828 if(sk->dead)
3829 {
3830 sk->acked_seq = new_seq + th->fin;
3831 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3832 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3833 tcp_statistics.TcpEstabResets++;
3834 tcp_set_state(sk,TCP_CLOSE);
3835 sk->err = EPIPE;
3836 sk->shutdown = SHUTDOWN_MASK;
3837 kfree_skb(skb, FREE_READ);
3838 return 0;
3839 }
3840 }
3841 }
3842 }
3843
3844 #endif
3845
3846 /*
3847 * Now we have to walk the chain, and figure out where this one
3848 * goes into it. This is set up so that the last packet we received
3849 * will be the first one we look at, that way if everything comes
3850 * in order, there will be no performance loss, and if they come
3851 * out of order we will be able to fit things in nicely.
3852 *
3853 * [AC: This is wrong. We should assume in order first and then walk
3854 * forwards from the first hole based upon real traffic patterns.]
3855 *
3856 */
3857
3858 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */
3859 {
3860 skb_queue_head(&sk->receive_queue,skb);
3861 skb1= NULL;
3862 }
3863 else
3864 {
3865 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
3866 {
3867 if(sk->debug)
3868 {
3869 printk("skb1=%p :", skb1);
3870 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
3871 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
3872 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
3873 sk->acked_seq);
3874 }
3875
3876 /*
3877 * Optimisation: Duplicate frame or extension of previous frame from
3878 * same sequence point (lost ack case).
3879 * The frame contains duplicate data or replaces a previous frame:
3880 * discard the previous frame (safe as sk->inuse is set) and put
3881 * the new one in its place.
3882 */
3883
3884 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3885 {
3886 skb_append(skb1,skb);
3887 skb_unlink(skb1);
3888 kfree_skb(skb1,FREE_READ);
3889 dup_dumped=1;
3890 skb1=NULL;
3891 break;
3892 }
3893
3894 /*
3895 * Found where it fits
3896 */
3897
3898 if (after(th->seq+1, skb1->h.th->seq))
3899 {
3900 skb_append(skb1,skb);
3901 break;
3902 }
3903
3904 /*
3905 * See if we've hit the start. If so insert.
3906 */
3907 if (skb1 == skb_peek(&sk->receive_queue))
3908 {
3909 skb_queue_head(&sk->receive_queue, skb);
3910 break;
3911 }
3912 }
3913 }
3914
3915 /*
3916 * Figure out what the ack value for this frame is
3917 */
3918
3919 th->ack_seq = th->seq + skb->len;
3920 if (th->syn)
3921 th->ack_seq++;
3922 if (th->fin)
3923 th->ack_seq++;
3924
3925 if (before(sk->acked_seq, sk->copied_seq))
3926 {
3927 printk("*** tcp.c:tcp_data bug acked < copied\n");
3928 sk->acked_seq = sk->copied_seq;
3929 }
3930
3931 /*
3932 * Now figure out if we can ack anything. This is very messy because we really want two
3933 * receive queues, a completed and an assembly queue. We also want only one transmit
3934 * queue.
3935 */
3936
3937 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
3938 {
3939 if (before(th->seq, sk->acked_seq+1))
3940 {
3941 int newwindow;
3942
3943 if (after(th->ack_seq, sk->acked_seq))
3944 {
3945 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3946 if (newwindow < 0)
3947 newwindow = 0;
3948 sk->window = newwindow;
3949 sk->acked_seq = th->ack_seq;
3950 }
3951 skb->acked = 1;
3952
3953 /*
3954 * When we ack the fin, we do the FIN
3955 * processing.
3956 */
3957
3958 if (skb->h.th->fin)
3959 {
3960 tcp_fin(skb,sk,skb->h.th);
3961 }
3962
3963 for(skb2 = skb->next;
3964 skb2 != (struct sk_buff *)&sk->receive_queue;
3965 skb2 = skb2->next)
3966 {
3967 if (before(skb2->h.th->seq, sk->acked_seq+1))
3968 {
3969 if (after(skb2->h.th->ack_seq, sk->acked_seq))
3970 {
3971 newwindow = sk->window -
3972 (skb2->h.th->ack_seq - sk->acked_seq);
3973 if (newwindow < 0)
3974 newwindow = 0;
3975 sk->window = newwindow;
3976 sk->acked_seq = skb2->h.th->ack_seq;
3977 }
3978 skb2->acked = 1;
3979 /*
3980 * When we ack the fin, we do
3981 * the fin handling.
3982 */
3983 if (skb2->h.th->fin)
3984 {
3985 tcp_fin(skb2,sk,skb2->h.th); /* process the fin carried by skb2, the segment just acked */
3986 }
3987
3988 /*
3989 * Force an immediate ack.
3990 */
3991
3992 sk->ack_backlog = sk->max_ack_backlog;
3993 }
3994 else
3995 {
3996 break;
3997 }
3998 }
3999
4000 /*
4001 * This also takes care of updating the window.
4002 * This if statement needs to be simplified.
4003 */
4004 if (!sk->delay_acks ||
4005 sk->ack_backlog >= sk->max_ack_backlog ||
4006 sk->bytes_rcv > sk->max_unacked || th->fin) {
4007 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4008 }
4009 else
4010 {
4011 sk->ack_backlog++;
4012 if(sk->debug)
4013 printk("Ack queued.\n");
4014 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4015 }
4016 }
4017 }
4018
4019 /*
4020 * If we've missed a packet, send an ack.
4021 * Also start a timer to send another.
4022 */
4023
4024 if (!skb->acked)
4025 {
4026
4027 /*
4028 * This is important. If we don't have much room left,
4029 * we need to throw out a few packets so we have a good
4030 * window. Note that mtu is used, not mss, because mss is really
4031 * for the send side. The peer could be sending us frames as large as the mtu.
4032 */
4033
4034 while (sk->prot->rspace(sk) < sk->mtu)
4035 {
4036 skb1 = skb_peek(&sk->receive_queue);
4037 if (skb1 == NULL)
4038 {
4039 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4040 break;
4041 }
4042
4043 /*
4044 * Don't throw out something that has been acked.
4045 */
4046
4047 if (skb1->acked)
4048 {
4049 break;
4050 }
4051
4052 skb_unlink(skb1);
4053 kfree_skb(skb1, FREE_READ);
4054 }
4055 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4056 sk->ack_backlog++;
4057 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4058 }
4059 else
4060 {
4061 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4062 }
4063
4064 /*
4065 * Now tell the user we may have some data.
4066 */
4067
4068 if (!sk->dead)
4069 {
4070 if(sk->debug)
4071 printk("Data wakeup.\n");
4072 sk->data_ready(sk,0);
4073 }
4074 return(0);
4075 }
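
/*
 * Editorial sketch: the queue walk above searches from the tail because
 * in-order arrival is the common case. A minimal stand-alone illustration
 * of the same idea over a sentinel-headed doubly linked list; the names
 * below (struct seg, seg_insert) are illustrative, not kernel APIs.
 */

struct seg {
	struct seg *next, *prev;
	unsigned int seq;	/* first sequence number carried by the segment */
};

/* Keep the list ordered by seq; scanning from the tail is O(1) when
   segments arrive in order. 'head' is a sentinel node. */
static void seg_insert(struct seg *head, struct seg *n)
{
	struct seg *p;

	for (p = head->prev; p != head; p = p->prev) {
		if ((int)(n->seq - p->seq) >= 0) {	/* n belongs after p (mod 2^32) */
			n->next = p->next;
			n->prev = p;
			p->next->prev = n;
			p->next = n;
			return;
		}
	}
	/* Ran off the front: n is the oldest segment, insert at the head. */
	n->next = head->next;
	n->prev = head;
	head->next->prev = n;
	head->next = n;
}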
4076
4077
4078 /*
4079 * This routine is only called when we have urgent data
4080 * signalled. It's the 'slow' part of tcp_urg. It could be
4081 * moved inline now as tcp_urg is only called from one
4082 * place. We handle URGent data wrong. We have to - as
4083 * BSD still doesn't use the correction from RFC961.
4084 */
4085
4086 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
4087 {
4088 u32 ptr = ntohs(th->urg_ptr);
4089
4090 if (ptr)
4091 ptr--;
4092 ptr += th->seq;
4093
4094 /* ignore urgent data that we've already seen and read */
4095 if (after(sk->copied_seq, ptr))
4096 return;
4097
4098 /* do we already have a newer (or duplicate) urgent pointer? */
4099 if (sk->urg_data && !after(ptr, sk->urg_seq))
4100 return;
4101
4102 /* tell the world about our new urgent pointer */
4103 if (sk->proc != 0) {
4104 if (sk->proc > 0) {
4105 kill_proc(sk->proc, SIGURG, 1);
4106 } else {
4107 kill_pg(-sk->proc, SIGURG, 1);
4108 }
4109 }
4110 sk->urg_data = URG_NOTYET;
4111 sk->urg_seq = ptr;
4112 }
4113
4114 /*
4115 * This is the 'fast' part of urgent handling.
4116 */
4117
4118 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
4119 unsigned long saddr, unsigned long len)
4120 {
4121 u32 ptr;
4122
4123 /*
4124 * Check if we get a new urgent pointer - normally not
4125 */
4126
4127 if (th->urg)
4128 tcp_check_urg(sk,th);
4129
4130 /*
4131 * Do we wait for any urgent data? - normally not
4132 */
4133
4134 if (sk->urg_data != URG_NOTYET)
4135 return 0;
4136
4137 /*
4138 * Is the urgent pointer pointing into this packet?
4139 */
4140
4141 ptr = sk->urg_seq - th->seq + th->doff*4;
4142 if (ptr >= len)
4143 return 0;
4144
4145 /*
4146 * Ok, got the correct packet, update info
4147 */
4148
4149 sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4150 if (!sk->dead)
4151 sk->data_ready(sk,0);
4152 return 0;
4153 }
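
/*
 * Editorial sketch: how the urgent byte stored in sk->urg_data above
 * reaches an application. A minimal user-space fragment, assuming a
 * connected TCP socket 'fd' whose peer has sent out-of-band data;
 * illustrative only, not part of this file.
 */

#include <sys/socket.h>

static void read_oob_byte(int fd)
{
	char oob;

	/* SIGURG (delivered via kill_proc()/kill_pg() in tcp_check_urg())
	   signals pending urgent data; MSG_OOB then pulls the single
	   urgent byte out of band. */
	if (recv(fd, &oob, 1, MSG_OOB) == 1)
		;	/* handle the urgent byte */
}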
4154
4155 /*
4156 * This will accept the next outstanding connection.
4157 */
4158
4159 static struct sock *tcp_accept(struct sock *sk, int flags)
4160 {
4161 struct sock *newsk;
4162 struct sk_buff *skb;
4163
4164 /*
4165 * We need to make sure that this socket is listening,
4166 * and that it has something pending.
4167 */
4168
4169 if (sk->state != TCP_LISTEN)
4170 {
4171 sk->err = EINVAL;
4172 return(NULL);
4173 }
4174
4175 /* Avoid the race. */
4176 cli();
4177 sk->inuse = 1;
4178
4179 while((skb = tcp_dequeue_established(sk)) == NULL)
4180 {
4181 if (flags & O_NONBLOCK)
4182 {
4183 sti();
4184 release_sock(sk);
4185 sk->err = EAGAIN;
4186 return(NULL);
4187 }
4188
4189 release_sock(sk);
4190 interruptible_sleep_on(sk->sleep);
4191 if (current->signal & ~current->blocked)
4192 {
4193 sti();
4194 sk->err = ERESTARTSYS;
4195 return(NULL);
4196 }
4197 sk->inuse = 1;
4198 }
4199 sti();
4200
4201 /*
4202 * Now all we need to do is return skb->sk.
4203 */
4204
4205 newsk = skb->sk;
4206
4207 kfree_skb(skb, FREE_READ);
4208 sk->ack_backlog--;
4209 release_sock(sk);
4210 return(newsk);
4211 }
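
/*
 * Editorial sketch: the O_NONBLOCK path above surfaces as EAGAIN in user
 * space, and ERESTARTSYS becomes a restarted call or EINTR. A minimal
 * illustrative accept loop (not part of this file):
 */

#include <sys/socket.h>
#include <stddef.h>
#include <errno.h>

static int accept_one(int listen_fd)
{
	for (;;) {
		int conn = accept(listen_fd, NULL, NULL);

		if (conn >= 0)
			return conn;	/* an established connection was queued */
		if (errno == EINTR)
			continue;	/* interrupted by a signal, retry */
		return -1;		/* EAGAIN on a non-blocking socket, or a real error */
	}
}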
4212
4213
4214 /*
4215 * This will initiate an outgoing connection.
4216 */
4217
4218 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
4219 {
4220 struct sk_buff *buff;
4221 struct device *dev=NULL;
4222 unsigned char *ptr;
4223 int tmp;
4224 int atype;
4225 struct tcphdr *t1;
4226 struct rtable *rt;
4227
4228 if (sk->state != TCP_CLOSE)
4229 {
4230 return(-EISCONN);
4231 }
4232
4233 if (addr_len < 8)
4234 return(-EINVAL);
4235
4236 if (usin->sin_family && usin->sin_family != AF_INET)
4237 return(-EAFNOSUPPORT);
4238
4239 /*
4240 * connect() to INADDR_ANY means loopback (BSD'ism).
4241 */
4242
4243 if(usin->sin_addr.s_addr==INADDR_ANY)
4244 usin->sin_addr.s_addr=ip_my_addr();
4245
4246 /*
4247 * Don't want a TCP connection going to a broadcast address
4248 */
4249
4250 if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
4251 return -ENETUNREACH;
4252
4253 sk->inuse = 1;
4254 sk->daddr = usin->sin_addr.s_addr;
4255 sk->write_seq = tcp_init_seq();
4256 sk->window_seq = sk->write_seq;
4257 sk->rcv_ack_seq = sk->write_seq -1;
4258 sk->err = 0;
4259 sk->dummy_th.dest = usin->sin_port;
4260 release_sock(sk);
4261
4262 buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4263 if (buff == NULL)
4264 {
4265 return(-ENOMEM);
4266 }
4267 sk->inuse = 1;
4268 buff->sk = sk;
4269 buff->free = 0;
4270 buff->localroute = sk->localroute;
4271
4272
4273 /*
4274 * Put in the IP header and routing stuff.
4275 */
4276
4277 rt=ip_rt_route(sk->daddr, NULL, NULL);
4278
4279
4280 /*
4281 * We need to build the routing stuff from the things saved in skb.
4282 */
4283
4284 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4285 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
4286 if (tmp < 0)
4287 {
4288 sk->prot->wfree(sk, buff);
4289 release_sock(sk);
4290 return(-ENETUNREACH);
4291 }
4292
4293 t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
4294
4295 memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4296 t1->seq = ntohl(sk->write_seq++);
4297 sk->sent_seq = sk->write_seq;
4298 buff->h.seq = sk->write_seq;
4299 t1->ack = 0;
4300 t1->window = 2;
4301 t1->res1=0;
4302 t1->res2=0;
4303 t1->rst = 0;
4304 t1->urg = 0;
4305 t1->psh = 0;
4306 t1->syn = 1;
4307 t1->urg_ptr = 0;
4308 t1->doff = 6;
4309 /* use 512 or whatever user asked for */
4310
4311 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4312 sk->window_clamp=rt->rt_window;
4313 else
4314 sk->window_clamp=0;
4315
4316 if (sk->user_mss)
4317 sk->mtu = sk->user_mss;
4318 else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
4319 sk->mtu = rt->rt_mss;
4320 else
4321 {
4322 #ifdef CONFIG_INET_SNARL
4323 if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
4324 #else
4325 if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
4326 #endif
4327 sk->mtu = 576 - HEADER_SIZE;
4328 else
4329 sk->mtu = MAX_WINDOW;
4330 }
4331 /*
4332 * but not bigger than device MTU
4333 */
4334
4335 if(sk->mtu <32)
4336 sk->mtu = 32; /* Sanity limit */
4337
4338 sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);
4339
4340 /*
4341 * Put in the TCP options to say MTU.
4342 */
4343
4344 ptr = skb_put(buff,4);
4345 ptr[0] = 2;
4346 ptr[1] = 4;
4347 ptr[2] = (sk->mtu) >> 8;
4348 ptr[3] = (sk->mtu) & 0xff;
4349 tcp_send_check(t1, sk->saddr, sk->daddr,
4350 sizeof(struct tcphdr) + 4, sk);
4351
4352 /*
4353 * This must go first, otherwise a really quick response will get reset.
4354 */
4355
4356 tcp_cache_zap();
4357 tcp_set_state(sk,TCP_SYN_SENT);
4358 if(rt&&rt->rt_flags&RTF_IRTT)
4359 sk->rto = rt->rt_irtt;
4360 else
4361 sk->rto = TCP_TIMEOUT_INIT;
4362 sk->retransmit_timer.function=&retransmit_timer;
4363 sk->retransmit_timer.data = (unsigned long)sk;
4364 reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */
4365 sk->retransmits = 0; /* Now works the right way instead of a hacked initial setting */
4366
4367 sk->prot->queue_xmit(sk, dev, buff, 0);
4368 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4369 tcp_statistics.TcpActiveOpens++;
4370 tcp_statistics.TcpOutSegs++;
4371
4372 release_sock(sk);
4373 return(0);
4374 }
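
/*
 * Editorial sketch of the MSS option written just above: kind 2, length 4,
 * then the 16-bit MSS high byte first (network order); with it the header
 * is 24 bytes, hence doff = 6. Illustrative helper, not part of this file.
 */

static void put_mss_option(unsigned char *ptr, unsigned short mss)
{
	ptr[0] = 2;		/* option kind: maximum segment size */
	ptr[1] = 4;		/* option length in bytes, including kind/len */
	ptr[2] = mss >> 8;	/* MSS, most significant byte first */
	ptr[3] = mss & 0xff;
}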
4375
4376
4377 /* This function checks to see if the tcp header is actually acceptable. */
4378 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
4379 struct options *opt, unsigned long saddr, struct device *dev)
4380 {
4381 u32 next_seq;
4382
4383 next_seq = len - 4*th->doff;
4384 if (th->fin)
4385 next_seq++;
4386 /* if we have a zero window, we can't have any data in the packet.. */
4387 if (next_seq && !sk->window)
4388 goto ignore_it;
4389 next_seq += th->seq;
4390
4391 /*
4392 * This isn't quite right. sk->acked_seq could be more recent
4393 * than sk->window. This is however close enough. We will accept
4394 * slightly more packets than we should, but it should not cause
4395 * problems unless someone is trying to forge packets.
4396 */
4397
4398 /* have we already seen all of this packet? */
4399 if (!after(next_seq+1, sk->acked_seq))
4400 goto ignore_it;
4401 /* or does it start beyond the window? */
4402 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4403 goto ignore_it;
4404
4405 /* ok, at least part of this packet would seem interesting.. */
4406 return 1;
4407
4408 ignore_it:
4409 if (th->rst)
4410 return 0;
4411
4412 /*
4413 * Send a reset if we get something not ours and we are
4414 * unsynchronized. Note: We don't do anything to our end. We
4415 * are just killing the bogus remote connection; then we will
4416 * connect again and it will work (with luck).
4417 */
4418
4419 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4420 {
4421 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4422 return 1;
4423 }
4424
4425 /* Try to resync things. */
4426 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4427 return 0;
4428 }
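
/*
 * Editorial sketch: the before()/after() tests used throughout are modular
 * 32-bit sequence comparisons, defined in this tree's include/net/tcp.h
 * along the lines below (written here with explicit 32-bit types so the
 * wraparound behaviour is visible). Illustrative, not part of this file.
 */

#define seq_before(a, b)	((int)((unsigned int)(a) - (unsigned int)(b)) < 0)
#define seq_after(a, b)		seq_before((b), (a))

/* Example: with acked_seq = 0xfffffff0 and th->seq = 0x00000010 the
   difference th->seq - acked_seq wraps to 0x20, so seq_after(th->seq,
   acked_seq) holds even though th->seq is numerically smaller. */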
4429
4430 /*
4431 * When we get a reset we do this.
4432 */
4433
4434 static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
4435 {
4436 sk->zapped = 1;
4437 sk->err = ECONNRESET;
4438 if (sk->state == TCP_SYN_SENT)
4439 sk->err = ECONNREFUSED;
4440 if (sk->state == TCP_CLOSE_WAIT)
4441 sk->err = EPIPE;
4442 #ifdef TCP_DO_RFC1337
4443 /*
4444 * Time wait assassination protection [RFC1337]
4445 */
4446 if(sk->state!=TCP_TIME_WAIT)
4447 {
4448 tcp_set_state(sk,TCP_CLOSE);
4449 sk->shutdown = SHUTDOWN_MASK;
4450 }
4451 #else
4452 tcp_set_state(sk,TCP_CLOSE);
4453 sk->shutdown = SHUTDOWN_MASK;
4454 #endif
4455 if (!sk->dead)
4456 sk->state_change(sk);
4457 kfree_skb(skb, FREE_READ);
4458 release_sock(sk);
4459 return(0);
4460 }
4461
4462 /*
4463 * A TCP packet has arrived.
4464 * skb->h.raw is the TCP header.
4465 */
4466
4467 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
4468 unsigned long daddr, unsigned short len,
4469 unsigned long saddr, int redo, struct inet_protocol * protocol)
4470 {
4471 struct tcphdr *th;
4472 struct sock *sk;
4473 int syn_ok=0;
4474
4475 tcp_statistics.TcpInSegs++;
4476 if(skb->pkt_type!=PACKET_HOST)
4477 {
4478 kfree_skb(skb,FREE_READ);
4479 return(0);
4480 }
4481
4482 th = skb->h.th;
4483
4484 /*
4485 * Find the socket, using the last hit cache if applicable.
4486 */
4487
4488 if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
4489 sk=(struct sock *)th_cache_sk;
4490 else
4491 {
4492 sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4493 th_cache_saddr=saddr;
4494 th_cache_daddr=daddr;
4495 th_cache_dport=th->dest;
4496 th_cache_sport=th->source;
4497 th_cache_sk=sk;
4498 }
4499
4500 /*
4501 * If this socket has got a reset it's to all intents and purposes
4502 * really dead. Count closed sockets as dead.
4503 *
4504 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4505 * simply drops data. This seems incorrect as a 'closed' TCP doesn't
4506 * exist, so it should cause resets as if the port was unreachable.
4507 */
4508
4509 if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4510 sk=NULL;
4511
4512 if (!redo)
4513 {
4514 /*
4515 * Pull up the IP header.
4516 */
4517 skb_pull(skb, skb->h.raw-skb->data);
4518 /*
4519 * Try to use the device checksum if provided.
4520 */
4521 if (
4522 (skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
4523 (!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
4524 )
4525 {
4526 skb->sk = NULL;
4527 kfree_skb(skb,FREE_READ);
4528 /*
4529 * We don't release the socket because it was
4530 * never marked in use.
4531 */
4532 return(0);
4533 }
4534 th->seq = ntohl(th->seq);
4535
4536 /* See if we know about the socket. */
4537 if (sk == NULL)
4538 {
4539 /*
4540 * No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4541 */
4542 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4543 skb->sk = NULL;
4544 /*
4545 * Discard frame
4546 */
4547 kfree_skb(skb, FREE_READ);
4548 return(0);
4549 }
4550
4551 /* skb->len = len;*/
4552 skb->acked = 0;
4553 skb->used = 0;
4554 skb->free = 0;
4555 skb->saddr = daddr;
4556 skb->daddr = saddr;
4557
4558 /* We may need to add it to the backlog here. */
4559 cli();
4560 if (sk->inuse)
4561 {
4562 skb_queue_tail(&sk->back_log, skb);
4563 sti();
4564 return(0);
4565 }
4566 sk->inuse = 1;
4567 sti();
4568 }
4569 else
4570 {
4571 if (sk==NULL)
4572 {
4573 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4574 skb->sk = NULL;
4575 kfree_skb(skb, FREE_READ);
4576 return(0);
4577 }
4578 }
4579
4580
4581 if (!sk->prot)
4582 {
4583 printk("IMPOSSIBLE 3\n");
4584 return(0);
4585 }
4586
4587
4588 /*
4589 * Charge the memory to the socket.
4590 */
4591
4592 if (sk->rmem_alloc + skb->truesize >= sk->rcvbuf)
4593 {
4594 kfree_skb(skb, FREE_READ);
4595 release_sock(sk);
4596 return(0);
4597 }
4598
4599 skb->sk=sk;
4600 sk->rmem_alloc += skb->truesize;
4601
4602 /*
4603 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4604 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4605 * compatibility. We also set up variables more thoroughly [Karn notes in the
4606 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4607 */
4608
4609 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */
4610 {
4611
4612 /*
4613 * Now deal with unusual cases.
4614 */
4615
4616 if(sk->state==TCP_LISTEN)
4617 {
4618 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */
4619 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4620
4621 /*
4622 * We don't care for RST, and non-SYN segments are absorbed (old segments).
4623 * Broadcast/multicast SYN isn't allowed. Note - there is a bug: if you change the
4624 * netmask on a running connection it can go broadcast. Even Suns have
4625 * this problem, so I'm ignoring it.
4626 */
4627
4628 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4629 {
4630 kfree_skb(skb, FREE_READ);
4631 release_sock(sk);
4632 return 0;
4633 }
4634
4635 /*
4636 * Guess we need to make a new socket up
4637 */
4638
4639 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4640
4641 /*
4642 * Now we have several options: In theory there is nothing else
4643 * in the frame. KA9Q has an option to send data with the syn,
4644 * BSD accepts data with the syn up to the [to be] advertised window
4645 * and Solaris 2.1 gives you a protocol error. For now we just ignore
4646 * it, that fits the spec precisely and avoids incompatibilities. It
4647 * would be nice in future to drop through and process the data.
4648 */
4649
4650 release_sock(sk);
4651 return 0;
4652 }
4653
4654 /* retransmitted SYN? */
4655 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4656 {
4657 kfree_skb(skb, FREE_READ);
4658 release_sock(sk);
4659 return 0;
4660 }
4661
4662 /*
4663 * SYN sent means we have to look for a suitable ack and either reset
4664 * for bad matches or go to connected
4665 */
4666
4667 if(sk->state==TCP_SYN_SENT)
4668 {
4669 /* Crossed SYN or previous junk segment */
4670 if(th->ack)
4671 {
4672 /* We got an ack, but it's not a good ack */
4673 if(!tcp_ack(sk,th,saddr,len))
4674 {
4675 /* Reset the ack - it's an ack from a
4676 different connection [ th->rst is checked in tcp_reset()] */
4677 tcp_statistics.TcpAttemptFails++;
4678 tcp_reset(daddr, saddr, th,
4679 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4680 kfree_skb(skb, FREE_READ);
4681 release_sock(sk);
4682 return(0);
4683 }
4684 if(th->rst)
4685 return tcp_std_reset(sk,skb);
4686 if(!th->syn)
4687 {
4688 /* A valid ack from a different connection
4689 start. Shouldn't happen but cover it */
4690 kfree_skb(skb, FREE_READ);
4691 release_sock(sk);
4692 return 0;
4693 }
4694 /*
4695 * Ok.. it's good. Set up sequence numbers and
4696 * move to established.
4697 */
4698 syn_ok=1; /* Don't reset this connection for the syn */
4699 sk->acked_seq=th->seq+1;
4700 sk->fin_seq=th->seq;
4701 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4702 tcp_set_state(sk, TCP_ESTABLISHED);
4703 tcp_options(sk,th);
4704 sk->dummy_th.dest=th->source;
4705 sk->copied_seq = sk->acked_seq;
4706 if(!sk->dead)
4707 {
4708 sk->state_change(sk);
4709 sock_wake_async(sk->socket, 0);
4710 }
4711 if(sk->max_window==0)
4712 {
4713 sk->max_window = 32;
4714 sk->mss = min(sk->max_window, sk->mtu);
4715 }
4716 }
4717 else
4718 {
4719 /* See if SYN's cross. Drop if boring */
4720 if(th->syn && !th->rst)
4721 {
4722 /* Crossed SYN's are fine - but talking to
4723 yourself is right out... */
4724 if(sk->saddr==saddr && sk->daddr==daddr &&
4725 sk->dummy_th.source==th->source &&
4726 sk->dummy_th.dest==th->dest)
4727 {
4728 tcp_statistics.TcpAttemptFails++;
4729 return tcp_std_reset(sk,skb);
4730 }
4731 tcp_set_state(sk,TCP_SYN_RECV);
4732
4733 /*
4734 * FIXME:
4735 * Must send SYN|ACK here
4736 */
4737 }
4738 /* Discard junk segment */
4739 kfree_skb(skb, FREE_READ);
4740 release_sock(sk);
4741 return 0;
4742 }
4743 /*
4744 * SYN_RECV with data maybe.. drop through
4745 */
4746 goto rfc_step6;
4747 }
4748
4749 /*
4750 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
4751 * a more complex suggestion for fixing these reuse issues in RFC1644
4752 * but it is not yet ready for general use. Also see RFC1379.
4753 */
4754
4755 #define BSD_TIME_WAIT
4756 #ifdef BSD_TIME_WAIT
4757 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
4758 after(th->seq, sk->acked_seq) && !th->rst)
4759 {
4760 u32 seq = sk->write_seq;
4761 if(sk->debug)
4762 printk("Doing a BSD time wait\n");
4763 tcp_statistics.TcpEstabResets++;
4764 sk->rmem_alloc -= skb->truesize;
4765 skb->sk = NULL;
4766 sk->err=ECONNRESET;
4767 tcp_set_state(sk, TCP_CLOSE);
4768 sk->shutdown = SHUTDOWN_MASK;
4769 release_sock(sk);
4770 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4771 if (sk && sk->state==TCP_LISTEN)
4772 {
4773 sk->inuse=1;
4774 skb->sk = sk;
4775 sk->rmem_alloc += skb->truesize;
4776 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
4777 release_sock(sk);
4778 return 0;
4779 }
4780 kfree_skb(skb, FREE_READ);
4781 return 0;
4782 }
4783 #endif
4784 }
4785
4786 /*
4787 * We are now in normal data flow (see the step list in the RFC).
4788 * Note most of these are inline now. I'll inline the lot when
4789 * I have time to test it hard and look at what gcc outputs
4790 */
4791
4792 if(!tcp_sequence(sk,th,len,opt,saddr,dev))
4793 {
4794 kfree_skb(skb, FREE_READ);
4795 release_sock(sk);
4796 return 0;
4797 }
4798
4799 if(th->rst)
4800 return tcp_std_reset(sk,skb);
4801
4802 /*
4803 * !syn_ok is effectively the state test in RFC793.
4804 */
4805
4806 if(th->syn && !syn_ok)
4807 {
4808 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
4809 return tcp_std_reset(sk,skb);
4810 }
4811
4812 /*
4813 * Process the ACK
4814 */
4815
4816
4817 if(th->ack && !tcp_ack(sk,th,saddr,len))
4818 {
4819 /*
4820 * Our three way handshake failed.
4821 */
4822
4823 if(sk->state==TCP_SYN_RECV)
4824 {
4825 tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
4826 }
4827 kfree_skb(skb, FREE_READ);
4828 release_sock(sk);
4829 return 0;
4830 }
4831
4832 rfc_step6: /* I'll clean this up later */
4833
4834 /*
4835 * Process urgent data
4836 */
4837
4838 if(tcp_urg(sk, th, saddr, len))
4839 {
4840 kfree_skb(skb, FREE_READ);
4841 release_sock(sk);
4842 return 0;
4843 }
4844
4845
4846 /*
4847 * Process the encapsulated data
4848 */
4849
4850 if(tcp_data(skb,sk, saddr, len))
4851 {
4852 kfree_skb(skb, FREE_READ);
4853 release_sock(sk);
4854 return 0;
4855 }
4856
4857 /*
4858 * And done
4859 */
4860
4861 release_sock(sk);
4862 return 0;
4863 }
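
/*
 * Editorial sketch of the last-hit cache at the top of tcp_rcv(): a
 * one-entry cache keyed on the 4-tuple, falling back to the full
 * get_sock() lookup on a miss. The names below are illustrative, not
 * kernel APIs.
 */

struct tcb_cache {
	unsigned long saddr, daddr;
	unsigned short sport, dport;
	void *sk;
};

static void *demux(struct tcb_cache *c,
		   unsigned long saddr, unsigned long daddr,
		   unsigned short sport, unsigned short dport,
		   void *(*slow_lookup)(unsigned long, unsigned long,
					unsigned short, unsigned short))
{
	if (c->saddr == saddr && c->daddr == daddr &&
	    c->sport == sport && c->dport == dport)
		return c->sk;		/* hit: same flow as the last segment */

	c->saddr = saddr;		/* miss: refill the cache entry */
	c->daddr = daddr;
	c->sport = sport;
	c->dport = dport;
	c->sk = slow_lookup(saddr, daddr, sport, dport);
	return c->sk;
}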
4864
4865 /*
4866 * This routine sends a packet with an out of date sequence
4867 * number. It assumes the other end will try to ack it.
4868 */
4869
4870 static void tcp_write_wakeup(struct sock *sk)
4871 {
4872 struct sk_buff *buff,*skb;
4873 struct tcphdr *t1;
4874 struct device *dev=NULL;
4875 int tmp;
4876
4877 if (sk->zapped)
4878 return; /* After a valid reset we can send no more */
4879
4880 /*
4881 * Write data can still be transmitted/retransmitted in the
4882 * following states. If any other state is encountered, return.
4883 * [listen/close will never occur here anyway]
4884 */
4885
4886 if (sk->state != TCP_ESTABLISHED &&
4887 sk->state != TCP_CLOSE_WAIT &&
4888 sk->state != TCP_FIN_WAIT1 &&
4889 sk->state != TCP_LAST_ACK &&
4890 sk->state != TCP_CLOSING
4891 )
4892 {
4893 return;
4894 }
4895 if ( before(sk->sent_seq, sk->window_seq) &&
4896 (skb=skb_peek(&sk->write_queue)))
4897 {
4898 /*
4899 * We are probing the opening of a window
4900 * but the window size is != 0
4901 * must have been a result SWS advoidance ( sender )
4902 */
4903
4904 struct iphdr *iph;
4905 struct tcphdr *th;
4906 struct tcphdr *nth;
4907 unsigned long win_size, ow_size;
4908 void * tcp_data_start;
4909
4910 /*
4911 * How many bytes can we send ?
4912 */
4913
4914 win_size = sk->window_seq - sk->sent_seq;
4915
4916 /*
4917 * Recover the buffer pointers
4918 */
4919
4920 iph = (struct iphdr *)(skb->data + skb->dev->hard_header_len);
4921 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
4922
4923 /*
4924 * Grab the data for a temporary frame
4925 */
4926
4927 buff = sk->prot->wmalloc(sk, win_size + th->doff * 4 +
4928 (iph->ihl << 2) +
4929 skb->dev->hard_header_len + 15,
4930 1, GFP_ATOMIC);
4931 if ( buff == NULL )
4932 return;
4933
4934 /*
4935 * If we strip the packet on the write queue we must
4936 * be ready to retransmit this one
4937 */
4938
4939 buff->free = /*0*/1;
4940
4941 buff->sk = sk;
4942 buff->localroute = sk->localroute;
4943
4944 /*
4945 * Put headers on the new packet
4946 */
4947
4948 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4949 IPPROTO_TCP, sk->opt, buff->truesize,
4950 sk->ip_tos,sk->ip_ttl);
4951 if (tmp < 0)
4952 {
4953 sk->prot->wfree(sk, buff);
4954 return;
4955 }
4956
4957 /*
4958 * Move the TCP header over
4959 */
4960
4961 buff->dev = dev;
4962
4963 nth = (struct tcphdr *) skb_put(buff,th->doff*4);
4964
4965 memcpy(nth, th, th->doff * 4);
4966
4967 /*
4968 * Correct the new header
4969 */
4970
4971 nth->ack = 1;
4972 nth->ack_seq = ntohl(sk->acked_seq);
4973 nth->window = ntohs(tcp_select_window(sk));
4974 nth->check = 0;
4975
4976 /*
4977 * Find the first data byte.
4978 */
4979
4980 tcp_data_start = skb->data + skb->dev->hard_header_len +
4981 (iph->ihl << 2) + th->doff * 4;
4982
4983 /*
4984 * Add it to our new buffer
4985 */
4986 memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
4987
4988 /*
4989 * Remember our right edge sequence number.
4990 */
4991
4992 buff->h.seq = sk->sent_seq + win_size;
4993 sk->sent_seq = buff->h.seq; /* Hack */
4994 #if 0
4995
4996 /*
4997 * now: shrink the queue head segment
4998 */
4999
5000 th->check = 0;
5001 ow_size = skb->len - win_size -
5002 ((unsigned long) (tcp_data_start - (void *) skb->data));
5003
5004 memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
5005 skb_trim(skb,skb->len-win_size);
5006 sk->sent_seq += win_size;
5007 th->seq = htonl(sk->sent_seq);
5008 if (th->urg)
5009 {
5010 unsigned short urg_ptr;
5011
5012 urg_ptr = ntohs(th->urg_ptr);
5013 if (urg_ptr <= win_size)
5014 th->urg = 0;
5015 else
5016 {
5017 urg_ptr -= win_size;
5018 th->urg_ptr = htons(urg_ptr);
5019 nth->urg_ptr = htons(win_size);
5020 }
5021 }
5022 #else
5023 if(th->urg && ntohs(th->urg_ptr) < win_size)
5024 nth->urg = 0;
5025 #endif
5026
5027 /*
5028 * Checksum the split buffer
5029 */
5030
5031 tcp_send_check(nth, sk->saddr, sk->daddr,
5032 nth->doff * 4 + win_size , sk);
5033 }
5034 else
5035 {
5036 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
5037 if (buff == NULL)
5038 return;
5039
5040 buff->free = 1;
5041 buff->sk = sk;
5042 buff->localroute = sk->localroute;
5043
5044 /*
5045 * Put in the IP header and routing stuff.
5046 */
5047
5048 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5049 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
5050 if (tmp < 0)
5051 {
5052 sk->prot->wfree(sk, buff);
5053 return;
5054 }
5055
5056 t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
5057 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
5058
5059 /*
5060 * Use a previous sequence.
5061 * This should cause the other end to send an ack.
5062 */
5063
5064 t1->seq = htonl(sk->sent_seq-1);
5065 t1->ack = 1;
5066 t1->res1= 0;
5067 t1->res2= 0;
5068 t1->rst = 0;
5069 t1->urg = 0;
5070 t1->psh = 0;
5071 t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
5072 t1->syn = 0;
5073 t1->ack_seq = ntohl(sk->acked_seq);
5074 t1->window = ntohs(tcp_select_window(sk));
5075 t1->doff = sizeof(*t1)/4;
5076 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
5077
5078 }
5079
5080 /*
5081 * Send it.
5082 */
5083
5084 sk->prot->queue_xmit(sk, dev, buff, 1);
5085 tcp_statistics.TcpOutSegs++;
5086 }
5087
5088 /*
5089 * A window probe timeout has occurred.
5090 */
5091
5092 void tcp_send_probe0(struct sock *sk)
5093 {
5094 if (sk->zapped)
5095 return; /* After a valid reset we can send no more */
5096
5097 tcp_write_wakeup(sk);
5098
5099 sk->backoff++;
5100 sk->rto = min(sk->rto << 1, 120*HZ);
5101 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5102 sk->retransmits++;
5103 sk->prot->retransmits ++;
5104 }
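
/*
 * Editorial sketch of the backoff above: the probe timeout doubles on
 * every unanswered probe and is clamped at 120 seconds. Illustrative
 * helper, not part of this file.
 */

static unsigned long next_probe_rto(unsigned long rto, unsigned long hz)
{
	unsigned long doubled = rto << 1;
	unsigned long cap = 120 * hz;	/* same clamp as sk->rto above */

	return doubled < cap ? doubled : cap;
}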
5105
5106 /*
5107 * Socket option code for TCP.
5108 */
5109
5110 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
5111 {
5112 int val,err;
5113
5114 if(level!=SOL_TCP)
5115 return ip_setsockopt(sk,level,optname,optval,optlen);
5116
5117 if (optval == NULL)
5118 return(-EINVAL);
5119
5120 err=verify_area(VERIFY_READ, optval, sizeof(int));
5121 if(err)
5122 return err;
5123
5124 val = get_user((int *)optval);
5125
5126 switch(optname)
5127 {
5128 case TCP_MAXSEG:
5129 /*
5130 * Values greater than the interface MTU won't take effect. However, at
5131 * the point when this call is done we typically don't yet know
5132 * which interface is going to be used.
5133 */
5134 if(val<1||val>MAX_WINDOW)
5135 return -EINVAL;
5136 sk->user_mss=val;
5137 return 0;
5138 case TCP_NODELAY:
5139 sk->nonagle=(val==0)?0:1;
5140 return 0;
5141 default:
5142 return(-ENOPROTOOPT);
5143 }
5144 }
5145
5146 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
5147 {
5148 int val,err;
5149
5150 if(level!=SOL_TCP)
5151 return ip_getsockopt(sk,level,optname,optval,optlen);
5152
5153 switch(optname)
5154 {
5155 case TCP_MAXSEG:
5156 val=sk->user_mss;
5157 break;
5158 case TCP_NODELAY:
5159 val=sk->nonagle;
5160 break;
5161 default:
5162 return(-ENOPROTOOPT);
5163 }
5164 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5165 if(err)
5166 return err;
5167 put_user(sizeof(int),(int *) optlen);
5168
5169 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5170 if(err)
5171 return err;
5172 put_user(val,(int *)optval);
5173
5174 return(0);
5175 }
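
/*
 * Editorial sketch: the two options above as seen from user space (on
 * Linux, SOL_TCP equals IPPROTO_TCP). Illustrative fragment assuming a
 * TCP socket 'fd'; not part of this file.
 */

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static void tune_tcp_socket(int fd)
{
	int one = 1;
	int mss = 512;

	/* Disable the Nagle algorithm (sk->nonagle above). */
	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));

	/* Ask for a maximum segment size; tcp_connect() clamps it to the
	   route/device MTU later. */
	setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss));
}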
5176
5177
5178 struct proto tcp_prot = {
5179 sock_wmalloc,
5180 sock_rmalloc,
5181 sock_wfree,
5182 sock_rfree,
5183 sock_rspace,
5184 sock_wspace,
5185 tcp_close,
5186 tcp_read,
5187 tcp_write,
5188 tcp_sendto,
5189 tcp_recvfrom,
5190 ip_build_header,
5191 tcp_connect,
5192 tcp_accept,
5193 ip_queue_xmit,
5194 tcp_retransmit,
5195 tcp_write_wakeup,
5196 tcp_read_wakeup,
5197 tcp_rcv,
5198 tcp_select,
5199 tcp_ioctl,
5200 NULL,
5201 tcp_shutdown,
5202 tcp_setsockopt,
5203 tcp_getsockopt,
5204 128,
5205 0,
5206 "TCP",
5207 0, 0,
5208 {NULL,}
5209 };